All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.google.gwt.thirdparty.streamhtmlparser.util.HtmlUtils Maven / Gradle / Ivy

There is a newer version: 7.4.0
Show newest version
/*
 * Copyright (C) 2010 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.gwt.thirdparty.streamhtmlparser.util;

import com.google.gwt.thirdparty.guava.common.collect.ImmutableSortedSet;

import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.Matcher;

/**
 * Utility functions for HTML and Javascript that are most likely
 * not interesting to users outside this package.
 *
 * 

The HtmlParser will be open-sourced hence we took the * decision to keep these utilities in this package as well as not to * leverage others that may exist in the google3 code base. * *

The functionality exposed is designed to be 100% compatible with * the corresponding logic in the C-version of the HtmlParser as such * we are particularly concerned with cross-language compatibility. * *

Note: The words {@code Javascript} and {@code ECMAScript} are used * interchangeably unless otherwise noted. */ public final class HtmlUtils { /** * static utility class */ private HtmlUtils() { } // COV_NF_LINE /** * Indicates the type of content contained in the {@code content} HTML * attribute of the {@code meta} HTML tag. Used by * {@link HtmlUtils#parseContentAttributeForUrl(String)}. *

The values are: *

    *
  • {@code NONE} if it does not contain a URL in the expected format. *
  • {@code URL_START} if it contains a URL but hasn't seen any of * its contents. *
  • {@code URL} if it contains a URL and has seen at least some of * its contents. *
*/ public enum META_REDIRECT_TYPE { NONE, URL_START, URL } /** * A regular expression matching the format of a {@code content} attribute * that contains a URL. Used by {@link #parseContentAttributeForUrl}. */ private static final String META_REDIRECT_REGEX = "^\\s*\\d*\\s*;\\s*URL\\s*=\\s*[\'\"]?"; // Safe for use by concurrent threads so we compile once. private static final Pattern META_REDIRECT_PATTERN = Pattern.compile(META_REDIRECT_REGEX, Pattern.CASE_INSENSITIVE); /** * Set of keywords that can precede a regular expression literal. Taken from: * * Language Syntax * *

The token {@code void} was added to the list. Several keywords are * defined in Ecmascript 4 not Ecmascript 3. However, to keep the logic * simple we do not differentiate on the version and bundle them all together. */ private static final Set REGEXP_TOKEN_PREFIXS = ImmutableSortedSet.of( "abstract", "break", "case", "catch", "class", "const", "continue", "debugger", "default", "delete", "do", "else", "enum", "eval", "export", "extends", "field", "final", "finally", "for", "function", "goto", "if", "implements", "import", "in", "instanceof", "native", "new", "package", "private", "protected", "public", "return", "static", "switch", "synchronized", "throw", "throws", "transient", "try", "typeof", "var", "void", "volatile", "while", "with"); /** * Set of all HTML attributes which expect a URI (as the value). * Index of Attributes */ private static final Set ATTRIBUTE_EXPECTS_URI = ImmutableSortedSet.of( "action", "archive", "background", "cite", "classid", "codebase", "data", "dynsrc", "href", "longdesc", "src", "usemap"); /** * Set of {@code Character}s considered whitespace in Javascript. * See {@link #isJavascriptWhitespace(char)} */ private static final Set JAVASCRIPT_WHITESPACE = ImmutableSortedSet.of( '\u0009', /* Tab \t */ '\n', /* Line-Feed 0x0A */ '\u000B', /* Vertical Tab 0x0B */ '\u000C', /* Form Feed \f */ '\r', /* Carriage Return 0x0D */ ' ', /* Space 0x20 */ '\u00A0', /* Non-breaking space 0xA0 */ '\u2028', /* Line separator */ '\u2029'); /* Paragraph separator */ /** * Set of {@code Character}s considered whitespace in HTML. * See {@link #isHtmlSpace(char)} */ private static final Set HTML_WHITESPACE = ImmutableSortedSet.of( ' ', '\t', '\n', '\r', '\u200B'); /** * Determines if the HTML attribute specified expects javascript * for its value. Such is the case for example with the {@code onclick} * attribute. * *

Currently returns {@code true} for any attribute name that starts * with "on" which is not exactly correct but we trust a developer to * not use non-spec compliant attribute names (e.g. onbogus). * * @param attribute the name of an HTML attribute * @return {@code false} if the input is null or is not an attribute * that expects javascript code; {@code true} */ public static boolean isAttributeJavascript(String attribute) { return ((attribute != null) && attribute.startsWith("on")); } /** * Determines if the HTML attribute specified expects a {@code style} * for its value. Currently this is only true for the {@code style} * HTML attribute. * * @param attribute the name of an HTML attribute * @return {@code true} iff the attribute name is one that expects a * style for a value; otherwise {@code false} */ public static boolean isAttributeStyle(String attribute) { return "style".equals(attribute); } /** * Determines if the HTML attribute specified expects a {@code URI} * for its value. For example, both {@code href} and {@code src} * expect a {@code URI} but {@code style} does not. Returns * {@code false} if the attribute given was {@code null}. * * @param attribute the name of an HTML attribute * @return {@code true} if the attribute name is one that expects * a URI for a value; otherwise {@code null} * * @see #ATTRIBUTE_EXPECTS_URI */ public static boolean isAttributeUri(String attribute) { return ATTRIBUTE_EXPECTS_URI.contains(attribute); } /** * Determines if the specified character is an HTML whitespace character. * A character is an HTML whitespace character if and only if it is one * of the characters below. *

    *
  • A Space character *
  • A Tab character *
  • A Line feed character *
  • A Carriage Return character *
  • A Zero-Width Space character *
* * Note: The list includes the zero-width space (​) * which is not included in the C version. * * @param chr the {@code char} to check * @return {@code true} if the character is an HTML whitespace character * * White space */ public static boolean isHtmlSpace(char chr) { return HTML_WHITESPACE.contains(chr); } /** * Determines if the specified character is an ECMAScript whitespace or line * terminator character. A character is a whitespace or line terminator if * and only if it is one of the characters below: *
    *
  • A white-space character (Tab, Vertical Tab, * Form Feed, Space, * No-break space) *
  • A line terminator character (Line Feed, * Carriage Return, Line separator, * Paragraph Separator). *
* *

Encompasses the characters in sections 7.2 and 7.3 of ECMAScript 3, in * particular, this list is quite different from that in * Character.isWhitespace. * * ECMAScript Language Specification * * @param chr the {@code char} to check * @return {@code true} or {@code false} * */ public static boolean isJavascriptWhitespace(char chr) { return JAVASCRIPT_WHITESPACE.contains(chr); } /** * Determines if the specified character is a valid character in an * ECMAScript identifier. This determination is currently not exact, * in particular: *

    *
  • It does not accept Unicode letters, only ASCII ones. *
  • It does not distinguish between the first character of an identifier * (which cannot contain numbers) and subsequent characters. *
  • *
* * We are considering leveraging Character.isJavaIdentifierStart * and Character.isJavaIdentifierPart given that Java * and Javascript follow similar identifier naming rules but we lose * compatibility with the C-version. * * @param chr {@code char} to check * @return {@code true} if the {@code chr} is a Javascript whitespace * character; otherwise {@code false} */ public static boolean isJavascriptIdentifier(char chr) { return ((chr >= 'a' && chr <= 'z') || (chr >= 'A' && chr <= 'Z') || (chr >= '0' && chr <= '9') || chr == '_' || chr == '$'); } /** * Determines if the input token provided is a valid token prefix to a * javascript regular expression. The token argument is compared against * a {@code Set} of identifiers that can precede a regular expression in the * javascript grammar, and returns {@code true} if the provided * {@code String} is in that {@code Set}. * * @param input the {@code String} token to check * @return {@code true} iff the token is a valid prefix of a regexp */ public static boolean isJavascriptRegexpPrefix(String input) { return REGEXP_TOKEN_PREFIXS.contains(input); } /** * Encodes the specified character using Ascii for convenient insertion into * a single-quote enclosed {@code String}. Printable characters * are returned as-is. Carriage Return, Line Feed, Horizontal Tab, * back-slash and single quote are all backslash-escaped. All other characters * are returned hex-encoded. * * @param chr {@code char} to encode * @return an Ascii-friendly encoding of the given {@code char} */ public static String encodeCharForAscii(char chr) { if (chr == '\'') { return "\\'"; } else if (chr == '\\') { return "\\\\"; } else if (chr >= 32 && chr <= 126) { return String.format("%c", chr); } else if (chr == '\n') { return "\\n"; } else if (chr == '\r') { return "\\r"; } else if (chr == '\t') { return "\\t"; } else { // Cannot apply a precision specifier for integral types. Specifying // 0-padded hex-encoding with minimum width of two. return String.format("\\u%04x", (int)chr); } } /** * Parses the given {@code String} to determine if it contains a URL in the * format followed by the {@code content} attribute of the {@code meta} * HTML tag. * *

This function expects to receive the value of the {@code content} HTML * attribute. This attribute takes on different meanings depending on the * value of the {@code http-equiv} HTML attribute of the same {@code meta} * tag. Since we may not have access to the {@code http-equiv} attribute, * we instead rely on parsing the given value to determine if it contains * a URL. * * The specification of the {@code meta} HTML tag can be found in: * http://dev.w3.org/html5/spec/Overview.html#attr-meta-http-equiv-refresh * *

We return {@link HtmlUtils.META_REDIRECT_TYPE} indicating whether the * value contains a URL and whether we are at the start of the URL or past * the start. We are at the start of the URL if and only if one of the two * conditions below is true: *

    *
  • The given input does not contain any characters from the URL proper. * Example "5; URL=". *
  • The given input only contains the optional leading single or double * quote leading the URL. Example "5; URL='". *
  • *
* *

Examples: *

    *
  • Example of a complete {@code meta} tag where the {@code content} * attribute contains a URL [we are not at the start of the URL]: *
       * <meta http-equiv="refresh" content="5; URL=http://www.google.com">
       * 
    *
  • Example of a complete {@code meta} tag where the {@code content} * attribute contains a URL [we are at the start of the URL]: *
       * <meta http-equiv="refresh" content="5; URL=">
       * 
    *
  • Example of a complete {@code meta} tag where the {@code content} * attribute does not contain a URL: *
       * <meta http-equiv="content-type" content="text/html">
       * 
    *
* * @param value {@code String} to parse * @return {@link HtmlUtils.META_REDIRECT_TYPE} indicating the presence * of a URL in the given value */ public static META_REDIRECT_TYPE parseContentAttributeForUrl(String value) { if (value == null) return META_REDIRECT_TYPE.NONE; Matcher matcher = META_REDIRECT_PATTERN.matcher(value); if (!matcher.find()) return META_REDIRECT_TYPE.NONE; // We have more content. if (value.length() > matcher.end()) return META_REDIRECT_TYPE.URL; return META_REDIRECT_TYPE.URL_START; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy