All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.owasp.html.HtmlLexer Maven / Gradle / Ivy

There is a newer version: 20240325.1
Show newest version
// Copyright (c) 2011, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.

package org.owasp.html;

import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import java.util.LinkedList;
import java.util.NoSuchElementException;
import java.util.Set;

import javax.annotation.concurrent.NotThreadSafe;

/**
 * A flexible lexer for HTML.
 * This is hairy code, but it is outside the TCB for the HTML sanitizer.
 *
 * <p>The lexer pulls raw tokens from an {@link HtmlInputSplitter} and
 * post-processes them: whitespace inside tags is dropped, adjacent text
 * outside tags is merged, and text inside tags is reclassified as attribute
 * names and attribute values.
 *
 * @author Mike Samuel
 */
@NotThreadSafe
final class HtmlLexer extends AbstractTokenStream {
  /** The HTML being lexed; token offsets index into this string. */
  private final String input;
  /** Source of raw, not yet reclassified, tokens. */
  private final HtmlInputSplitter splitter;
  /** Where we are relative to the tag/attribute structure. */
  private State state = State.OUTSIDE_TAG;

  /**
   * @param input the HTML text to break into tokens.
   */
  public HtmlLexer(String input) {
    this.input = input;
    this.splitter = new HtmlInputSplitter(input);
  }

  /**
   * Normalize case of names that are not name-spaced.  This lower-cases HTML
   * element and attribute names, but not ones for embedded SVG or MATHML.
   *
   * @param elementOrAttribName an element or attribute name.
   * @return the name unchanged if it contains a {@code ':'}, otherwise the
   *     lower-cased name.
   */
  static String canonicalName(String elementOrAttribName) {
    return elementOrAttribName.indexOf(':') >= 0
        ? elementOrAttribName : Strings.toLowerCase(elementOrAttribName);
  }

  /**
   * An FSM that lets us reclassify text tokens inside tags as attribute
   * names/values
   */
  private static enum State {
    /** Not inside a tag. */
    OUTSIDE_TAG,
    /** Inside a tag, not in the middle of an attribute. */
    IN_TAG,
    /** Inside a tag, just after an attribute name. */
    SAW_NAME,
    /** Inside a tag, just after the '=' that follows an attribute name. */
    SAW_EQ,
    ;
  }

  /**
   * Produces the next token if one is available.
   * This may require fetching and combining multiple tokens from the underlying
   * splitter.
   *
   * @return the next token, or null once the splitter and the lookahead
   *     buffer are both exhausted.
   */
  @Override
  protected HtmlToken produce() {
    HtmlToken token = readToken();
    if (token == null) { return null; }

    switch (token.type) {

      // Keep track of whether we're inside a tag or not.
      case TAGBEGIN:
        state = State.IN_TAG;
        break;
      case TAGEND:
        if (state == State.SAW_EQ) {
          // The tag ended right after "name=".  Distinguish an attribute
          // with an explicitly empty value, as in <a title=>, from a
          // valueless attribute, as in <a title>, by emitting a zero-length
          // ATTRVALUE first.  The tag end is pushed back so that the next
          // call yields it.
          pushbackToken(token);
          state = State.IN_TAG;
          return HtmlToken.instance(
              token.start, token.start, HtmlTokenType.ATTRVALUE);
        }

        state = State.OUTSIDE_TAG;
        break;

      // Drop ignorable tokens by discarding the one received and recursing
      case IGNORABLE:
        return produce();

      // collapse adjacent text nodes if we're outside a tag, or otherwise,
      // Recognize attribute names and values.
      default:
        switch (state) {
          case OUTSIDE_TAG:
            if (HtmlTokenType.TEXT == token.type
                || HtmlTokenType.UNESCAPED == token.type) {
              token = collapseSubsequent(token);
            }
            break;
          case IN_TAG:
            if (HtmlTokenType.TEXT == token.type
                && !token.tokenInContextMatches(input, "=")) {
              // Reclassify as attribute name
              token = HtmlInputSplitter.reclassify(
                  token, HtmlTokenType.ATTRNAME);
              state = State.SAW_NAME;
            }
            break;
          case SAW_NAME:
            if (HtmlTokenType.TEXT == token.type) {
              if (token.tokenInContextMatches(input, "=")) {
                state = State.SAW_EQ;
                // Skip the '=' token
                return produce();
              } else {
                // A name directly after a name: the previous attribute had
                // no value.  Reclassify as attribute name
                token = HtmlInputSplitter.reclassify(
                    token, HtmlTokenType.ATTRNAME);
              }
            } else {
              state = State.IN_TAG;
            }
            break;
          case SAW_EQ:
            if (HtmlTokenType.TEXT == token.type
                || HtmlTokenType.QSTRING == token.type) {
              if (HtmlTokenType.TEXT == token.type) {
                // Collapse adjacent text tokens so that an unquoted value
                // which the splitter broke into pieces is kept whole, e.g.
                //   <a onclick=this.clicked=true>
                //   <a title=foo bar>
                token = collapseAttributeName(token);
              }
              // Reclassify as value
              token = HtmlInputSplitter.reclassify(
                  token, HtmlTokenType.ATTRVALUE);
              state = State.IN_TAG;
            }
            break;
        }
        break;
    }

    return token;
  }

  /**
   * Collapses all the following tokens of the same type into the given token.
   *
   * @param token the first token of the run.
   * @return a token of the same type that starts where {@code token} starts
   *     and ends where the last consecutive token of that type ends.
   */
  private HtmlToken collapseSubsequent(HtmlToken token) {
    HtmlToken collapsed = token;
    for (HtmlToken next;
         (next = peekToken(0)) != null && next.type == token.type;
         readToken()) {
      collapsed = join(collapsed, next);
    }
    return collapsed;
  }

  /**
   * Extends the first text token of an unquoted attribute value over the
   * tokens that follow it and still belong to the value.
   *
   * @param token the first TEXT token after the '='.
   * @return {@code token} itself if nothing is merged, otherwise a TEXT token
   *     spanning from the start of {@code token} to the end of the last
   *     merged token.
   */
  private HtmlToken collapseAttributeName(HtmlToken token) {
    // We want to collapse into the value any following tokens that are still
    // part of it.  We should include any space or text adjacent to the
    // value, but should stop at any of the following constructions:
    //   space end-of-file              e.g. name=foo_
    //   space valueless-attrib-name    e.g. name=foo checked
    //   space tag-end                  e.g. name=foo />
    //   space text space? '='          e.g. name=foo bar=
    int nToMerge = 0;
    for (HtmlToken t; (t = peekToken(nToMerge)) != null;) {
      if (t.type == HtmlTokenType.IGNORABLE) {
        // Whitespace is only merged when the text after it does not look
        // like the start of the next attribute.
        HtmlToken tok = peekToken(nToMerge + 1);
        if (tok == null) { break; }
        if (tok.type != HtmlTokenType.TEXT) { break; }
        if (isValuelessAttribute(input.substring(tok.start, tok.end))) {
          break;
        }
        // Look past one optional run of whitespace for an '='.
        HtmlToken eq = peekToken(nToMerge + 2);
        if (eq != null && eq.type == HtmlTokenType.IGNORABLE) {
          eq = peekToken(nToMerge + 3);
        }
        if (eq == null || eq.tokenInContextMatches(input, "=")) {
          break;
        }
      } else if (t.type != HtmlTokenType.TEXT) {
        break;
      }
      ++nToMerge;
    }
    if (nToMerge == 0) { return token; }

    // Consume the merged tokens; the end of the last one ends the value.
    int end = token.end;
    do {
      end = readToken().end;
    } while (--nToMerge > 0);

    return HtmlToken.instance(token.start, end, HtmlTokenType.TEXT);
  }

  /** A token of a's type spanning from the start of a to the end of b. */
  private static HtmlToken join(HtmlToken a, HtmlToken b) {
    return HtmlToken.instance(a.start, b.end, a.type);
  }

  /**
   * Tokens already taken from the splitter but not yet consumed, in order.
   * Typed so that {@link #readToken} and {@link #peekToken} yield
   * {@link HtmlToken}s without casts.
   */
  private final LinkedList<HtmlToken> lookahead = Lists.newLinkedList();

  /**
   * Consumes the next raw token, preferring buffered lookahead.
   *
   * @return the next token or null if there is none.
   */
  private HtmlToken readToken() {
    if (!lookahead.isEmpty()) {
      return lookahead.remove();
    } else if (splitter.hasNext()) {
      return splitter.next();
    } else {
      return null;
    }
  }

  /**
   * Looks ahead without consuming.
   *
   * @param i zero-based distance from the next unread token.
   * @return the i-th upcoming token or null if the input ends before it.
   */
  private HtmlToken peekToken(int i) {
    while (lookahead.size() <= i && splitter.hasNext()) {
      lookahead.add(splitter.next());
    }
    return lookahead.size() > i ? lookahead.get(i) : null;
  }

  /** Makes token the next one returned by {@link #readToken}. */
  private void pushbackToken(HtmlToken token) {
    lookahead.addFirst(token);
  }

  /** Can the attribute appear in HTML without a value. */
  private static boolean isValuelessAttribute(String attribName) {
    boolean valueless = VALUELESS_ATTRIB_NAMES.contains(
        Strings.toLowerCase(attribName));
    return valueless;
  }

  // From http://issues.apache.org/jira/browse/XALANC-519
  /** Lower-case names of attributes that may appear without a value. */
  private static final Set<String> VALUELESS_ATTRIB_NAMES = ImmutableSet.of(
      "checked", "compact", "declare", "defer", "disabled",
      "ismap", "multiple", "nohref", "noresize", "noshade",
      "nowrap", "readonly", "selected");
}

/**
 * A token stream that breaks a character stream into 
 * HtmlTokenType.{TEXT,TAGBEGIN,TAGEND,DIRECTIVE,COMMENT,CDATA,DIRECTIVE}
 * tokens.  The matching of attribute names and values is done in a later step.
 */
final class HtmlInputSplitter extends AbstractTokenStream {
  /** The source of HTML character data. */
  private final String input;
  /** An offset into input. */
  private int offset;
  /** True iff the current character is inside a tag. */
  private boolean inTag;
  /**
   * True if inside a script, xmp, listing, or similar tag whose content does
   * not follow the normal escaping rules.
   */
  private boolean inEscapeExemptBlock;

  /**
   * Null or the name of the close tag required to end the current escape exempt
   * block.
   * Preformatted tags include <script>, <xmp>, etc. that may
   * contain unescaped HTML input.
   */
  private String escapeExemptTagName = null;

  private HtmlTextEscapingMode textEscapingMode;

  public HtmlInputSplitter(String input) {
    this.input = input;
  }

  /**
   * Make sure that there is a token ready to yield in this.token.
   */
  @Override
  protected HtmlToken produce() {
    HtmlToken token = parseToken();
    if (null == token) { return null; }

    // Handle escape-exempt blocks.
    // The parse() method is only dimly aware of escape-excempt blocks, so
    // here we detect the beginning and ends of escape exempt blocks, and
    // reclassify as UNESCAPED, any tokens that appear in the middle.
    if (inEscapeExemptBlock) {
      if (token.type != HtmlTokenType.SERVERCODE) {
        // classify RCDATA as text since it can contain entities
        token = reclassify(
            token, (this.textEscapingMode == HtmlTextEscapingMode.RCDATA
                    ? HtmlTokenType.TEXT
                    : HtmlTokenType.UNESCAPED));
      }
    } else {
      switch (token.type) {
        case TAGBEGIN:
          {
            String canonTagName = canonicalName(
                token.start + 1, token.end);
            if (HtmlTextEscapingMode.isTagFollowedByLiteralContent(
                    canonTagName)) {
              this.escapeExemptTagName = canonTagName;
              this.textEscapingMode = HtmlTextEscapingMode.getModeForTag(
                  canonTagName);
            }
            break;
          }
        case TAGEND:
          this.inEscapeExemptBlock = null != this.escapeExemptTagName;
          break;
        default:
          break;
      }
    }
    return token;
  }

  /**
   * States for a state machine for optimistically identifying tags and other
   * html/xml/phpish structures.
   */
  private static enum State {
    TAGNAME,
    SLASH,
    BANG,
    BANG_DASH,
    COMMENT,
    COMMENT_DASH,
    COMMENT_DASH_DASH,
    DIRECTIVE,
    DONE,
    BOGUS_COMMENT,
    SERVER_CODE,
    SERVER_CODE_PCT,
    ;
  }

  private HtmlToken lastNonIgnorable = null;
  /**
   * Breaks the character stream into tokens.
   * This method returns a stream of tokens such that each token starts where
   * the last token ended.
   *
   * 

This property is useful as it allows fetch to collapse and reclassify * ranges of tokens based on state that is easy to maintain there. * *

Later passes are responsible for throwing away useless tokens. */ private HtmlToken parseToken() { int start = offset; int limit = input.length(); if (start == limit) { return null; } int end = start + 1; HtmlTokenType type; char ch = input.charAt(start); if (inTag) { if ('>' == ch) { type = HtmlTokenType.TAGEND; inTag = false; } else if ('/' == ch) { if (end != limit && '>' == input.charAt(end)) { type = HtmlTokenType.TAGEND; inTag = false; ++end; } else { type = HtmlTokenType.TEXT; } } else if ('=' == ch) { type = HtmlTokenType.TEXT; } else if ('"' == ch || '\'' == ch) { type = HtmlTokenType.QSTRING; int delim = ch; for (; end < limit; ++end) { if (input.charAt(end) == delim) { ++end; break; } } } else if (!Character.isWhitespace(ch)) { type = HtmlTokenType.TEXT; for (; end < limit; ++end) { ch = input.charAt(end); // End a text chunk before /> if ((lastNonIgnorable == null || !lastNonIgnorable.tokenInContextMatches(input, "=")) && '/' == ch && end + 1 < limit && '>' == input.charAt(end + 1)) { break; } else if ('>' == ch || '=' == ch || Character.isWhitespace(ch)) { break; } else if ('"' == ch || '\'' == ch) { if (end + 1 < limit) { char ch2 = input.charAt(end + 1); if (Character.isWhitespace(ch2) || ch2 == '>' || ch2 == '/') { ++end; break; } } } } } else { // We skip whitespace tokens inside tag bodies. type = HtmlTokenType.IGNORABLE; while (end < limit && Character.isWhitespace(input.charAt(end))) { ++end; } } } else { if (ch == '<') { if (end == limit) { type = HtmlTokenType.TEXT; } else { ch = input.charAt(end); type = null; State state = null; switch (ch) { case '/': // close tag? 
state = State.SLASH; ++end; break; case '!': // Comment or declaration if (!this.inEscapeExemptBlock) { state = State.BANG; } ++end; break; case '?': if (!this.inEscapeExemptBlock) { state = State.BOGUS_COMMENT; } ++end; break; case '%': state = State.SERVER_CODE; ++end; break; default: if (isIdentStart(ch) && !this.inEscapeExemptBlock) { state = State.TAGNAME; ++end; } else if ('<' == ch) { type = HtmlTokenType.TEXT; } else { ++end; } break; } if (null != state) { charloop: while (end < limit) { ch = input.charAt(end); switch (state) { case TAGNAME: if (Character.isWhitespace(ch) || '>' == ch || '/' == ch || '<' == ch) { // End processing of an escape exempt block when we see // a corresponding end tag. if (this.inEscapeExemptBlock && '/' == input.charAt(start + 1) && textEscapingMode != HtmlTextEscapingMode.PLAIN_TEXT && canonicalName(start + 2, end) .equals(escapeExemptTagName)) { this.inEscapeExemptBlock = false; this.escapeExemptTagName = null; this.textEscapingMode = null; } type = HtmlTokenType.TAGBEGIN; // Don't process content as attributes if we're inside // an escape exempt block. inTag = !this.inEscapeExemptBlock; state = State.DONE; break charloop; } break; case SLASH: if (Character.isLetter(ch)) { state = State.TAGNAME; } else { if ('<' == ch) { type = HtmlTokenType.TEXT; } else { ++end; } break charloop; } break; case BANG: if ('-' == ch) { state = State.BANG_DASH; } else { state = State.DIRECTIVE; } break; case BANG_DASH: if ('-' == ch) { state = State.COMMENT; } else { state = State.DIRECTIVE; } break; case COMMENT: if ('-' == ch) { state = State.COMMENT_DASH; } break; case COMMENT_DASH: state = ('-' == ch) ? 
State.COMMENT_DASH_DASH : State.COMMENT_DASH; break; case COMMENT_DASH_DASH: if ('>' == ch) { state = State.DONE; type = HtmlTokenType.COMMENT; } else if ('-' == ch) { state = State.COMMENT_DASH_DASH; } else { state = State.COMMENT_DASH; } break; case DIRECTIVE: if ('>' == ch) { type = HtmlTokenType.DIRECTIVE; state = State.DONE; } break; case BOGUS_COMMENT: if ('>' == ch) { type = HtmlTokenType.QMARKMETA; state = State.DONE; } break; case SERVER_CODE: if ('%' == ch) { state = State.SERVER_CODE_PCT; } break; case SERVER_CODE_PCT: if ('>' == ch) { type = HtmlTokenType.SERVERCODE; state = State.DONE; } else if ('%' != ch) { state = State.SERVER_CODE; } break; case DONE: throw new AssertionError( "Unexpectedly DONE while lexing HTML token stream"); } ++end; if (State.DONE == state) { break; } } if (end == limit) { switch (state) { case DONE: break; case BOGUS_COMMENT: type = HtmlTokenType.QMARKMETA; break; case COMMENT: case COMMENT_DASH: case COMMENT_DASH_DASH: type = HtmlTokenType.COMMENT; break; case DIRECTIVE: case SERVER_CODE: case SERVER_CODE_PCT: type = HtmlTokenType.SERVERCODE; break; case TAGNAME: type = HtmlTokenType.TAGBEGIN; break; default: type = HtmlTokenType.TEXT; break; } } } } } else { type = null; } } if (null == type) { while (end < limit && '<' != input.charAt(end)) { ++end; } type = HtmlTokenType.TEXT; } offset = end; HtmlToken result = HtmlToken.instance(start, end, type); if (type != HtmlTokenType.IGNORABLE) { lastNonIgnorable = result; } return result; } private String canonicalName(int start, int end) { return HtmlLexer.canonicalName(input.substring(start, end)); } private static boolean isIdentStart(char ch) { return ch >= 'A' && ch <= 'z' && (ch <= 'Z' || ch >= 'a'); } static HtmlToken reclassify(HtmlToken token, HtmlTokenType type) { return HtmlToken.instance(token.start, token.end, type); } } /** * A TokenStream that lazily fetches one token at a time. 
* * @author Mike Samuel ([email protected]) */ abstract class AbstractTokenStream implements TokenStream { private HtmlToken tok; public final boolean hasNext() { if (tok == null) { tok = produce(); } return tok != null; } public HtmlToken next() { if (this.tok == null) { this.tok = produce(); } HtmlToken t = this.tok; if (t == null) { throw new NoSuchElementException(); } this.tok = null; return t; } protected abstract HtmlToken produce(); }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy