vendor.golang.org.x.net.html.token.go Maven / Gradle / Ivy
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package html
import (
"bytes"
"errors"
"io"
"strconv"
"strings"
"golang.org/x/net/html/atom"
)
// A TokenType is the type of a Token.
type TokenType uint32
const (
// ErrorToken means that an error occurred during tokenization.
ErrorToken TokenType = iota
// TextToken means a text node.
TextToken
// A StartTagToken looks like .
StartTagToken
// An EndTagToken looks like .
EndTagToken
// A SelfClosingTagToken tag looks like
.
SelfClosingTagToken
// A CommentToken looks like .
CommentToken
// A DoctypeToken looks like
DoctypeToken
)
// ErrBufferExceeded means that the buffering limit was exceeded.
var ErrBufferExceeded = errors.New("max buffer exceeded")
// String returns a string representation of the TokenType.
func (t TokenType) String() string {
switch t {
case ErrorToken:
return "Error"
case TextToken:
return "Text"
case StartTagToken:
return "StartTag"
case EndTagToken:
return "EndTag"
case SelfClosingTagToken:
return "SelfClosingTag"
case CommentToken:
return "Comment"
case DoctypeToken:
return "Doctype"
}
return "Invalid(" + strconv.Itoa(int(t)) + ")"
}
// An Attribute is an attribute namespace-key-value triple. Namespace is
// non-empty for foreign attributes like xlink, Key is alphabetic (and hence
// does not contain escapable characters like '&', '<' or '>'), and Val is
// unescaped (it looks like "a"
case EndTagToken:
return "" + t.tagString() + ">"
case SelfClosingTagToken:
return "<" + t.tagString() + "/>"
case CommentToken:
return ""
case DoctypeToken:
return ""
}
return "Invalid(" + strconv.Itoa(int(t.Type)) + ")"
}
// span is a range of bytes in a Tokenizer's buffer. The start is inclusive,
// the end is exclusive.
type span struct {
start, end int
}
// A Tokenizer returns a stream of HTML Tokens.
type Tokenizer struct {
// r is the source of the HTML text.
r io.Reader
// tt is the TokenType of the current token.
tt TokenType
// err is the first error encountered during tokenization. It is possible
// for tt != Error && err != nil to hold: this means that Next returned a
// valid token but the subsequent Next call will return an error token.
// For example, if the HTML text input was just "plain", then the first
// Next call would set z.err to io.EOF but return a TextToken, and all
// subsequent Next calls would return an ErrorToken.
// err is never reset. Once it becomes non-nil, it stays non-nil.
err error
// readErr is the error returned by the io.Reader r. It is separate from
// err because it is valid for an io.Reader to return (n int, err1 error)
// such that n > 0 && err1 != nil, and callers should always process the
// n > 0 bytes before considering the error err1.
readErr error
// buf[raw.start:raw.end] holds the raw bytes of the current token.
// buf[raw.end:] is buffered input that will yield future tokens.
raw span
buf []byte
// maxBuf limits the data buffered in buf. A value of 0 means unlimited.
maxBuf int
// buf[data.start:data.end] holds the raw bytes of the current token's data:
// a text token's text, a tag token's tag name, etc.
data span
// pendingAttr is the attribute key and value currently being tokenized.
// When complete, pendingAttr is pushed onto attr. nAttrReturned is
// incremented on each call to TagAttr.
pendingAttr [2]span
attr [][2]span
nAttrReturned int
// rawTag is the "script" in "" that closes the next token. If
// non-empty, the subsequent call to Next will return a raw or RCDATA text
// token: one that treats "" as text instead of an element.
// rawTag's contents are lower-cased.
rawTag string
// textIsRaw is whether the current text token's data is not escaped.
textIsRaw bool
// convertNUL is whether NUL bytes in the current token's data should
// be converted into \ufffd replacement characters.
convertNUL bool
// allowCDATA is whether CDATA sections are allowed in the current context.
allowCDATA bool
}
// AllowCDATA sets whether or not the tokenizer recognizes as
// the text "foo". The default value is false, which means to recognize it as
// a bogus comment "" instead.
//
// Strictly speaking, an HTML5 compliant tokenizer should allow CDATA if and
// only if tokenizing foreign content, such as MathML and SVG. However,
// tracking foreign-contentness is difficult to do purely in the tokenizer,
// as opposed to the parser, due to HTML integration points: an