diff --git a/compare.go b/compare.go new file mode 100644 index 0000000..4e31dbd --- /dev/null +++ b/compare.go @@ -0,0 +1,68 @@ +package xmltree + +import ( + "bytes" + "encoding/xml" + "sort" +) + +// Equal returns true if two xmltree.Elements are equal, ignoring +// differences in white space, sub-element order, and namespace prefixes. +func Equal(a, b *Element) bool { + return equal(a, b, 0) +} + +type byName []Element + +func (l byName) Len() int { return len(l) } +func (l byName) Less(i, j int) bool { + return l[i].Name.Space+l[i].Name.Local < l[j].Name.Space+l[j].Name.Local +} +func (l byName) Swap(i, j int) { l[i], l[j] = l[j], l[i] } + +func equal(a, b *Element, depth int) bool { + const maxDepth = 1000 + if depth > maxDepth { + return false + } + if !equalElement(a, b) { + return false + } + if len(a.Children) != len(b.Children) { + return false + } + if len(a.Children) == 0 { + return bytes.Equal(bytes.TrimSpace(a.Content), bytes.TrimSpace(b.Content)) + } + sort.Sort(byName(a.Children)) + sort.Sort(byName(b.Children)) + for i := range a.Children { + if !equal(&a.Children[i], &b.Children[i], depth+1) { + return false + } + } + return true +} + +func equalElement(a, b *Element) bool { + if a.Name != b.Name { + return false + } + attrs := make(map[xml.Name]string) + for _, a := range a.StartElement.Attr { + if a.Name.Space == "xmlns" || a.Name.Local == "xmlns" { + continue + } + attrs[a.Name] = a.Value + } + + for _, a := range b.StartElement.Attr { + if a.Name.Space == "xmlns" || a.Name.Local == "xmlns" { + continue + } + if v, ok := attrs[a.Name]; !ok || v != a.Value { + return false + } + } + return true +} diff --git a/element.go b/element.go new file mode 100644 index 0000000..396644d --- /dev/null +++ b/element.go @@ -0,0 +1,149 @@ +package xmltree + +import ( + "encoding/xml" + "fmt" + "log" +) + +// An Element represents a single element in an XML document. Elements +// may have zero or more children. The byte array used by the Content +// field is shared among all elements in the document, and should not +// be modified. An Element also captures xml namespace prefixes, so +// that arbitrary QNames in attribute values can be resolved. +type Element struct { + xml.StartElement + // The XML namespace scope at this element's location in the + // document. + Scope + // The raw content contained within this element's start and + // end tags. Uses the underlying byte array passed to Parse. + Content []byte + // Sub-elements contained within this element. + Children []Element +} + +// Attr gets the value of the first attribute whose name matches the +// space and local arguments. If space is the empty string, only +// attributes' local names are considered when looking for a match. +// If an attribute could not be found, the empty string is returned. +func (ele *Element) Attr(space, local string) string { + for _, v := range ele.StartElement.Attr { + if v.Name.Local != local { + continue + } + if space == "" || space == v.Name.Space { + return v.Value + } + } + return "" +} + +func (ele *Element) parse(scanner *scanner, data []byte, depth int) error { + if depth > recursionLimit { + return errDeepXML + } + ele.StartElement.Attr = ele.pushNS(ele.StartElement) + + begin := scanner.InputOffset() + end := begin +walk: + for scanner.scan() { + switch tok := scanner.tok.(type) { + case xml.StartElement: + child := Element{StartElement: tok.Copy(), Scope: ele.Scope} + if err := child.parse(scanner, data, depth+1); err != nil { + return err + } + ele.Children = append(ele.Children, child) + case xml.EndElement: + if tok.Name != ele.Name { + return fmt.Errorf("Expecting %s>, got %s>", ele.Prefix(ele.Name), ele.Prefix(tok.Name)) + } + ele.Content = data[int(begin):int(end)] + break walk + } + end = scanner.InputOffset() + } + return scanner.err +} + +// Flatten produces a slice of Element pointers referring to +// the children of el, and their children, in depth-first order. +func (ele *Element) Flatten() []*Element { + return ele.SearchFunc(func(*Element) bool { return true }) +} + +// SetAttr adds an XML attribute to an Element's existing Attributes. +// If the attribute already exists, it is replaced. +func (ele *Element) SetAttr(space, local, value string) { + for i, a := range ele.StartElement.Attr { + if a.Name.Local != local { + continue + } + if space == "" || a.Name.Space == space { + ele.StartElement.Attr[i].Value = value + return + } + } + ele.StartElement.Attr = append(ele.StartElement.Attr, xml.Attr{ + Name: xml.Name{Space: space, Local: local}, + Value: value, + }) +} + +// Search searches the Element tree for Elements with an xml tag +// matching the name and xml namespace. If space is the empty string, +// any namespace is matched. +func (ele *Element) Search(space, local string) []*Element { + return ele.SearchFunc(func(el *Element) bool { + if local != el.Name.Local { + return false + } + return space == "" || space == el.Name.Space + }) +} + +// SearchFunc traverses the Element tree in depth-first order and returns +// a slice of Elements for which the function fn returns true. +func (ele *Element) SearchFunc(fn func(*Element) bool) []*Element { + var results []*Element + var search func(el *Element) + + search = func(el *Element) { + if fn(el) { + results = append(results, el) + } + log.Print(el.walk(search)) + } + log.Print(ele.walk(search)) + return results +} + +// walkFunc is the type of the function called for each of an Element's +// children. +type walkFunc func(*Element) + +// The walk method calls the walkFunc for each of the Element's children. +// If the WalkFunc returns a non-nil error, Walk will return it +// immediately. +func (ele *Element) walk(fn walkFunc) error { + for i := 0; i < len(ele.Children); i++ { + fn(&ele.Children[i]) + } + return nil +} + +type byXMLName []xml.Name + +func (x byXMLName) Len() int { return len(x) } +func (x byXMLName) Less(i, j int) bool { + return x[i].Space+x[i].Local < x[j].Space+x[j].Local +} +func (x byXMLName) Swap(i, j int) { x[i], x[j] = x[j], x[i] } + +// String returns the XML encoding of an Element +// and its children as a string. +func (ele *Element) String() string { + return string(Marshal(ele)) +} diff --git a/example_test.go b/example_test.go new file mode 100644 index 0000000..222dfa0 --- /dev/null +++ b/example_test.go @@ -0,0 +1,228 @@ +package xmltree + +import ( + "fmt" + "log" +) + +func ExampleElement_Search() { + data := ` + + + Ira Glass + + + Tom Magliozzi + + + Terry Gross + + + ` + root, err := Parse([]byte(data)) + if err != nil { + log.Fatal(err) + } + for _, el := range root.Search("", "FullName") { + fmt.Printf("%s\n", el.Content) + } + + // Output: + // Ira Glass + // Tom Magliozzi + // Terry Gross +} + +func ExampleElement_SearchFunc() { + data := ` + + + Grace R. Emlin + + gre@example.com + + + gre@work.com + + + + Michael P. Thompson + + michaelp@example.com + + + michaelp@work.com + michael.thompson@work.com + + + + ` + + root, err := Parse([]byte(data)) + if err != nil { + log.Fatal(err) + } + + workEmails := root.SearchFunc(func(el *Element) bool { + return el.Name.Local == "Email" && el.Attr("", "where") == "work" + }) + + for _, el := range workEmails { + for _, addr := range el.Children { + fmt.Printf("%s\n", addr.Content) + } + } + + // Output: + // gre@work.com + // michaelp@work.com + // michael.thompson@work.com +} + +func ExampleUnmarshal() { + var input = []byte(` + + Page title + edit=sysop:move=sysop + + 2001-01-15T13:15:00Z + Foobar + I have just one thing to say! + A bunch of [[text]] here. + + + + 2001-01-15T13:10:27Z + 10.0.0.2 + new! + An earlier [[revision]]. + + + + + Talk:Page title + + 2001-01-15T14:03:00Z + 10.0.0.2 + hey + WHYD YOU LOCK PAGE??!!! i was editing that jerk + + + `) + + type revision struct { + Timestamp string `xml:"timestamp"` + Contributor string `xml:"contributor>ip"` + Comment string `xml:"comment"` + Text []string `xml:"text"` + } + + root, err := Parse(input) + if err != nil { + log.Fatal(err) + } + + // Pull all items from the input + for _, el := range root.Search("", "revision") { + var rev revision + if err := Unmarshal(el, &rev); err != nil { + log.Print(err) + continue + } + fmt.Println(rev.Timestamp, rev.Comment) + } + + // Output: + // 2001-01-15T13:15:00Z I have just one thing to say! + // 2001-01-15T13:10:27Z new! + // 2001-01-15T14:03:00Z hey +} + +func ExampleMarshal() { + var input = []byte(` + + + + Civilizing Huck.Miss Watson.Tom Sawyer Waits. + 1 + + + The Boys Escape Jim.Torn Sawyer's Gang.Deep-laid Plans. + 2 + + + A Good Going-over.Grace Triumphant."One of Tom Sawyers's Lies". + 3 + + + Huck and the Judge.Superstition. + 4 + + + `) + + var chapters []Element + root, err := Parse(input) + if err != nil { + log.Fatal(err) + } + + for _, el := range root.Search("", "chapter") { + for _, child := range el.Search("", "title") { + el.Content = child.Content + } + el.Children = nil + chapters = append(chapters, *el) + } + root.Children = chapters + fmt.Printf("%s\n", MarshalIndent(root, "", " ")) + + // Output: + // + // Civilizing Huck.Miss Watson.Tom Sawyer Waits. + // The Boys Escape Jim.Torn Sawyer's Gang.Deep-laid Plans. + // A Good Going-over.Grace Triumphant."One of Tom Sawyers's Lies". + // Huck and the Judge.Superstition. + // +} + +func ExampleMarshalNested() { + var input = []byte(` + + + + Hello World + + 3 + + + Hello, again + + 12 + + + `) + + parsed, err := Parse(input) + if err != nil { + log.Fatal(err.Error()) + } + + fmt.Printf("%s\n", MarshalIndent(parsed, "", " ")) + + // Output: + // + // + // + // Hello World + // + // 3 + // + // + // Hello, again + // + // 12 + // + // + // +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..54f5d38 --- /dev/null +++ b/go.mod @@ -0,0 +1,5 @@ +module github.com/wedo-workflow/xmltree + +go 1.16 + +require golang.org/x/net v0.0.0-20210420210106-798c2154c571 diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..0592051 --- /dev/null +++ b/go.sum @@ -0,0 +1,8 @@ +golang.org/x/net v0.0.0-20210420210106-798c2154c571 h1:Q6Bg8xzKzpFPU4Oi1sBnBTHBwlMsLeEXpu4hYBY8rAg= +golang.org/x/net v0.0.0-20210420210106-798c2154c571/go.mod h1:72T/g9IO56b78aLF+1Kcs5dz7/ng1VjMUvfKvpfy+jM= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210420072515-93ed5bcd2bfe/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/text v0.3.6 h1:aRYxNxv6iGQlyVaZmk6ZgYEDa+Jg18DxebPSrd6bg1M= +golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= diff --git a/marshal.go b/marshal.go new file mode 100644 index 0000000..6e61138 --- /dev/null +++ b/marshal.go @@ -0,0 +1,166 @@ +package xmltree + +import ( + "bytes" + "encoding/xml" + "io" + "text/template" +) + +// NOTE(droyo) As of go1.5.1, the encoding/xml package does not resolve +// prefixes in attribute names. Therefore we add .Name.Space verbatim +// instead of trying to resolve it. One consequence is this is that we cannot +// rename prefixes without some work. +var tagTmpl = template.Must(template.New("Marshal XML tags").Parse( + `{{define "start" -}} + <{{.Scope.Prefix .Name -}} + {{range .StartElement.Attr}} {{$.Scope.Prefix .Name -}}="{{.Value}}"{{end -}} + {{range .NS }} xmlns{{ if .Local }}:{{ .Local }}{{end}}="{{ .Space }}"{{end -}} + {{if or .Children .Content}}>{{else}} />{{end}} + {{- end}} + + {{define "end" -}} + {{.Prefix .Name}}>{{end}}`)) + +// Marshal produces the XML encoding of an Element as a self-contained +// document. The xmltree package may adjust the declarations of XML +// namespaces if the Element has been modified, or is part of a larger scope, +// such that the document produced by Marshal is a valid XML document. +// +// The return value of Marshal will use the utf-8 encoding regardless of +// the original encoding of the source document. +func Marshal(el *Element) []byte { + var buf bytes.Buffer + if err := Encode(&buf, el); err != nil { + // bytes.Buffer.Write should never return an error + panic(err) + } + return buf.Bytes() +} + +// MarshalIndent is like Marshal, but adds line breaks for each +// successive element. Each line begins with prefix and is +// followed by zero or more copies of indent according to the +// nesting depth. +func MarshalIndent(el *Element, prefix, indent string) []byte { + var buf bytes.Buffer + enc := encoder{ + w: &buf, + prefix: prefix, + indent: indent, + pretty: true, + } + if err := enc.encode(el, nil, make(map[*Element]struct{})); err != nil { + panic(err) + } + return buf.Bytes() +} + +// Encode writes the XML encoding of the Element to w. +// Encode returns any errors encountered writing to w. +func Encode(w io.Writer, el *Element) error { + enc := encoder{w: w} + return enc.encode(el, nil, make(map[*Element]struct{})) +} + +type encoder struct { + w io.Writer + prefix, indent string + pretty bool +} + +// This could be used to print a subset of an XML document, or a document +// that has been modified. In such an event, namespace declarations must +// be "pulled" in, so they can be resolved properly. This is trickier than +// just defining everything at the top level because there may be conflicts +// introduced by the modifications. +func (e *encoder) encode(el, parent *Element, visited map[*Element]struct{}) error { + if len(visited) > recursionLimit { + // We only return I/O errors + return nil + } + if _, ok := visited[el]; ok { + // We have a cycle. Leave a comment, but no error + e.w.Write([]byte("")) + return nil + } + scope := diffScope(parent, el) + if err := e.encodeOpenTag(el, scope, len(visited)); err != nil { + return err + } + if len(el.Children) == 0 { + if len(el.Content) > 0 { + e.w.Write(el.Content) + } else { + return nil + } + } + for i := range el.Children { + visited[el] = struct{}{} + if err := e.encode(&el.Children[i], el, visited); err != nil { + return err + } + delete(visited, el) + } + if err := e.encodeCloseTag(el, len(visited)); err != nil { + return err + } + return nil +} + +// diffScope returns the Scope of the child element, minus any +// identical namespace declaration in the parent's scope. +func diffScope(parent, child *Element) Scope { + if parent == nil { // root element + return child.Scope + } + childScope := child.Scope + parentScope := parent.Scope + for len(parentScope.ns) > 0 && len(childScope.ns) > 0 { + if childScope.ns[0] == parentScope.ns[0] { + childScope.ns = childScope.ns[1:] + parentScope.ns = parentScope.ns[1:] + } else { + break + } + } + return childScope +} + +func (e *encoder) encodeOpenTag(el *Element, scope Scope, depth int) error { + if e.pretty { + for i := 0; i < depth; i++ { + io.WriteString(e.w, e.indent) + } + } + var tag = struct { + *Element + NS []xml.Name + }{Element: el, NS: scope.ns} + if err := tagTmpl.ExecuteTemplate(e.w, "start", tag); err != nil { + return err + } + if e.pretty { + if len(el.Children) > 0 || len(el.Content) == 0 { + io.WriteString(e.w, "\n") + } + } + return nil +} + +func (e *encoder) encodeCloseTag(el *Element, depth int) error { + if e.pretty { + for i := 0; i < depth; i++ { + if len(el.Children) > 0 { + io.WriteString(e.w, e.indent) + } + } + } + if err := tagTmpl.ExecuteTemplate(e.w, "end", el); err != nil { + return err + } + if e.pretty { + io.WriteString(e.w, "\n") + } + return nil +} diff --git a/scope.go b/scope.go new file mode 100644 index 0000000..8764278 --- /dev/null +++ b/scope.go @@ -0,0 +1,119 @@ +package xmltree + +import ( + "encoding/xml" + "sort" + "strings" +) + +// A Scope represents the xml namespace scope at a given position in +// the document. +type Scope struct { + ns []xml.Name +} + +// The JoinScope method joins two Scopes together. When resolving +// prefixes using the returned scope, the prefix list in the argument +// Scope is searched before that of the receiver Scope. +func (scope *Scope) JoinScope(inner *Scope) *Scope { + return &Scope{append(scope.ns, inner.ns...)} +} + +// Resolve translates an XML QName (namespace-prefixed string) to an +// xml.Name with a canonicalized namespace in its Space field. This can +// be used when working with XSD documents, which put QNames in attribute +// values. If qname does not have a prefix, the default namespace is used.If +// a namespace prefix cannot be resolved, the returned value's Space field +// will be the unresolved prefix. Use the ResolveNS function to detect when +// a namespace prefix cannot be resolved. +func (scope *Scope) Resolve(qname string) xml.Name { + name, _ := scope.ResolveNS(qname) + return name +} + +// The ResolveNS method is like Resolve, but returns false for its second +// return value if a namespace prefix cannot be resolved. +func (scope *Scope) ResolveNS(qname string) (xml.Name, bool) { + var prefix, local string + parts := strings.SplitN(qname, ":", 2) + if len(parts) == 2 { + prefix, local = parts[0], parts[1] + } else { + prefix, local = "", parts[0] + } + switch prefix { + case "xml": + return xml.Name{Space: xmlLangURI, Local: local}, true + case "xmlns": + return xml.Name{Space: xmlNamespaceURI, Local: local}, true + } + for i := len(scope.ns) - 1; i >= 0; i-- { + if scope.ns[i].Local == prefix { + return xml.Name{Space: scope.ns[i].Space, Local: local}, true + } + } + return xml.Name{Space: prefix, Local: local}, false +} + +// ResolveDefault is like Resolve, but allows for the default namespace to +// be overridden. The namespace of strings without a namespace prefix +// (known as an NCName in XML terminology) will be defaultns. +func (scope *Scope) ResolveDefault(qname, defaultns string) xml.Name { + if defaultns == "" || strings.Contains(qname, ":") { + return scope.Resolve(qname) + } + return xml.Name{Space: defaultns, Local: qname} +} + +// Prefix is the inverse of Resolve. It uses the closest prefix +// defined for a namespace to create a string of the form +// prefix:local. If the namespace cannot be found, or is the +// default namespace, an unqualified name is returned. +func (scope *Scope) Prefix(name xml.Name) (qname string) { + switch name.Space { + case "": + return name.Local + case xmlLangURI: + return "xml:" + name.Local + case xmlNamespaceURI: + return "xmlns:" + name.Local + } + for i := len(scope.ns) - 1; i >= 0; i-- { + if scope.ns[i].Space == name.Space { + if scope.ns[i].Local == "" { + // Favor default NS if there is an extra + // qualified NS declaration + qname = name.Local + } else if len(qname) == 0 { + qname = scope.ns[i].Local + ":" + name.Local + } + } + } + return qname +} + +func (scope *Scope) pushNS(tag xml.StartElement) []xml.Attr { + var ns []xml.Name + var newAttrs []xml.Attr + for _, attr := range tag.Attr { + if attr.Name.Space == "xmlns" { + ns = append(ns, xml.Name{attr.Value, attr.Name.Local}) + } else if attr.Name.Local == "xmlns" { + ns = append(ns, xml.Name{attr.Value, ""}) + } else { + newAttrs = append(newAttrs, attr) + } + } + // Within a single tag, all ns declarations are sorted. This reduces + // differences between xmlns declarations between tags when + // modifying the xml tree. + sort.Sort(byXMLName(ns)) + if len(ns) > 0 { + scope.ns = append(scope.ns, ns...) + // Ensure that future additions to the scope create + // a new backing array. This prevents the scope from + // being clobbered during parsing. + scope.ns = scope.ns[:len(scope.ns):len(scope.ns)] + } + return newAttrs +} diff --git a/testdata/iso8859-1.xsd b/testdata/iso8859-1.xsd new file mode 100644 index 0000000..0b2925f --- /dev/null +++ b/testdata/iso8859-1.xsd @@ -0,0 +1,29 @@ + + + + Gambardella, Matthew + XML Developer's Guide + Computer + 44.95 + 2000-10-01 + An in-depth look at creating applications + with XML. + + + Astrid Lindgren + Bröderna Lejonhjärta + 46.27 + 1973 + Nangijala upplever bröderna äventyr och tillsammans med andra motståndskämpar för de kampen mot den ondskefulle Tengils förtryck och draken Katla. Boken är illustrerad av Ilon Wikland och översatt till 46 språk + + + Corets, Eva + Maeve Ascendant + Fantasy + 5.95 + 2000-11-17 + After the collapse of a nanotechnology + society in England, the young survivors lay the + foundation for a new society. + + diff --git a/vendor/golang.org/x/net/AUTHORS b/vendor/golang.org/x/net/AUTHORS new file mode 100644 index 0000000..15167cd --- /dev/null +++ b/vendor/golang.org/x/net/AUTHORS @@ -0,0 +1,3 @@ +# This source code refers to The Go Authors for copyright purposes. +# The master list of authors is in the main Go distribution, +# visible at http://tip.golang.org/AUTHORS. diff --git a/vendor/golang.org/x/net/CONTRIBUTORS b/vendor/golang.org/x/net/CONTRIBUTORS new file mode 100644 index 0000000..1c4577e --- /dev/null +++ b/vendor/golang.org/x/net/CONTRIBUTORS @@ -0,0 +1,3 @@ +# This source code was written by the Go contributors. +# The master list of contributors is in the main Go distribution, +# visible at http://tip.golang.org/CONTRIBUTORS. diff --git a/vendor/golang.org/x/net/LICENSE b/vendor/golang.org/x/net/LICENSE new file mode 100644 index 0000000..6a66aea --- /dev/null +++ b/vendor/golang.org/x/net/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2009 The Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/golang.org/x/net/PATENTS b/vendor/golang.org/x/net/PATENTS new file mode 100644 index 0000000..7330990 --- /dev/null +++ b/vendor/golang.org/x/net/PATENTS @@ -0,0 +1,22 @@ +Additional IP Rights Grant (Patents) + +"This implementation" means the copyrightable works distributed by +Google as part of the Go project. + +Google hereby grants to You a perpetual, worldwide, non-exclusive, +no-charge, royalty-free, irrevocable (except as stated in this section) +patent license to make, have made, use, offer to sell, sell, import, +transfer and otherwise run, modify and propagate the contents of this +implementation of Go, where such license applies only to those patent +claims, both currently owned or controlled by Google and acquired in +the future, licensable by Google that are necessarily infringed by this +implementation of Go. This grant does not include claims that would be +infringed only as a consequence of further modification of this +implementation. If you or your agent or exclusive licensee institute or +order or agree to the institution of patent litigation against any +entity (including a cross-claim or counterclaim in a lawsuit) alleging +that this implementation of Go or any code incorporated within this +implementation of Go constitutes direct or contributory patent +infringement, or inducement of patent infringement, then any patent +rights granted to you under this License for this implementation of Go +shall terminate as of the date such litigation is filed. diff --git a/vendor/golang.org/x/net/html/atom/atom.go b/vendor/golang.org/x/net/html/atom/atom.go new file mode 100644 index 0000000..cd0a8ac --- /dev/null +++ b/vendor/golang.org/x/net/html/atom/atom.go @@ -0,0 +1,78 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package atom provides integer codes (also known as atoms) for a fixed set of +// frequently occurring HTML strings: tag names and attribute keys such as "p" +// and "id". +// +// Sharing an atom's name between all elements with the same tag can result in +// fewer string allocations when tokenizing and parsing HTML. Integer +// comparisons are also generally faster than string comparisons. +// +// The value of an atom's particular code is not guaranteed to stay the same +// between versions of this package. Neither is any ordering guaranteed: +// whether atom.H1 < atom.H2 may also change. The codes are not guaranteed to +// be dense. The only guarantees are that e.g. looking up "div" will yield +// atom.Div, calling atom.Div.String will return "div", and atom.Div != 0. +package atom // import "golang.org/x/net/html/atom" + +// Atom is an integer code for a string. The zero value maps to "". +type Atom uint32 + +// String returns the atom's name. +func (a Atom) String() string { + start := uint32(a >> 8) + n := uint32(a & 0xff) + if start+n > uint32(len(atomText)) { + return "" + } + return atomText[start : start+n] +} + +func (a Atom) string() string { + return atomText[a>>8 : a>>8+a&0xff] +} + +// fnv computes the FNV hash with an arbitrary starting value h. +func fnv(h uint32, s []byte) uint32 { + for i := range s { + h ^= uint32(s[i]) + h *= 16777619 + } + return h +} + +func match(s string, t []byte) bool { + for i, c := range t { + if s[i] != c { + return false + } + } + return true +} + +// Lookup returns the atom whose name is s. It returns zero if there is no +// such atom. The lookup is case sensitive. +func Lookup(s []byte) Atom { + if len(s) == 0 || len(s) > maxAtomLen { + return 0 + } + h := fnv(hash0, s) + if a := table[h&uint32(len(table)-1)]; int(a&0xff) == len(s) && match(a.string(), s) { + return a + } + if a := table[(h>>16)&uint32(len(table)-1)]; int(a&0xff) == len(s) && match(a.string(), s) { + return a + } + return 0 +} + +// String returns a string whose contents are equal to s. In that sense, it is +// equivalent to string(s) but may be more efficient. +func String(s []byte) string { + if a := Lookup(s); a != 0 { + return a.String() + } + return string(s) +} diff --git a/vendor/golang.org/x/net/html/atom/table.go b/vendor/golang.org/x/net/html/atom/table.go new file mode 100644 index 0000000..2a93886 --- /dev/null +++ b/vendor/golang.org/x/net/html/atom/table.go @@ -0,0 +1,783 @@ +// Code generated by go generate gen.go; DO NOT EDIT. + +//go:generate go run gen.go + +package atom + +const ( + A Atom = 0x1 + Abbr Atom = 0x4 + Accept Atom = 0x1a06 + AcceptCharset Atom = 0x1a0e + Accesskey Atom = 0x2c09 + Acronym Atom = 0xaa07 + Action Atom = 0x27206 + Address Atom = 0x6f307 + Align Atom = 0xb105 + Allowfullscreen Atom = 0x2080f + Allowpaymentrequest Atom = 0xc113 + Allowusermedia Atom = 0xdd0e + Alt Atom = 0xf303 + Annotation Atom = 0x1c90a + AnnotationXml Atom = 0x1c90e + Applet Atom = 0x31906 + Area Atom = 0x35604 + Article Atom = 0x3fc07 + As Atom = 0x3c02 + Aside Atom = 0x10705 + Async Atom = 0xff05 + Audio Atom = 0x11505 + Autocomplete Atom = 0x2780c + Autofocus Atom = 0x12109 + Autoplay Atom = 0x13c08 + B Atom = 0x101 + Base Atom = 0x3b04 + Basefont Atom = 0x3b08 + Bdi Atom = 0xba03 + Bdo Atom = 0x14b03 + Bgsound Atom = 0x15e07 + Big Atom = 0x17003 + Blink Atom = 0x17305 + Blockquote Atom = 0x1870a + Body Atom = 0x2804 + Br Atom = 0x202 + Button Atom = 0x19106 + Canvas Atom = 0x10306 + Caption Atom = 0x23107 + Center Atom = 0x22006 + Challenge Atom = 0x29b09 + Charset Atom = 0x2107 + Checked Atom = 0x47907 + Cite Atom = 0x19c04 + Class Atom = 0x56405 + Code Atom = 0x5c504 + Col Atom = 0x1ab03 + Colgroup Atom = 0x1ab08 + Color Atom = 0x1bf05 + Cols Atom = 0x1c404 + Colspan Atom = 0x1c407 + Command Atom = 0x1d707 + Content Atom = 0x58b07 + Contenteditable Atom = 0x58b0f + Contextmenu Atom = 0x3800b + Controls Atom = 0x1de08 + Coords Atom = 0x1ea06 + Crossorigin Atom = 0x1fb0b + Data Atom = 0x4a504 + Datalist Atom = 0x4a508 + Datetime Atom = 0x2b808 + Dd Atom = 0x2d702 + Default Atom = 0x10a07 + Defer Atom = 0x5c705 + Del Atom = 0x45203 + Desc Atom = 0x56104 + Details Atom = 0x7207 + Dfn Atom = 0x8703 + Dialog Atom = 0xbb06 + Dir Atom = 0x9303 + Dirname Atom = 0x9307 + Disabled Atom = 0x16408 + Div Atom = 0x16b03 + Dl Atom = 0x5e602 + Download Atom = 0x46308 + Draggable Atom = 0x17a09 + Dropzone Atom = 0x40508 + Dt Atom = 0x64b02 + Em Atom = 0x6e02 + Embed Atom = 0x6e05 + Enctype Atom = 0x28d07 + Face Atom = 0x21e04 + Fieldset Atom = 0x22608 + Figcaption Atom = 0x22e0a + Figure Atom = 0x24806 + Font Atom = 0x3f04 + Footer Atom = 0xf606 + For Atom = 0x25403 + ForeignObject Atom = 0x2540d + Foreignobject Atom = 0x2610d + Form Atom = 0x26e04 + Formaction Atom = 0x26e0a + Formenctype Atom = 0x2890b + Formmethod Atom = 0x2a40a + Formnovalidate Atom = 0x2ae0e + Formtarget Atom = 0x2c00a + Frame Atom = 0x8b05 + Frameset Atom = 0x8b08 + H1 Atom = 0x15c02 + H2 Atom = 0x2de02 + H3 Atom = 0x30d02 + H4 Atom = 0x34502 + H5 Atom = 0x34f02 + H6 Atom = 0x64d02 + Head Atom = 0x33104 + Header Atom = 0x33106 + Headers Atom = 0x33107 + Height Atom = 0x5206 + Hgroup Atom = 0x2ca06 + Hidden Atom = 0x2d506 + High Atom = 0x2db04 + Hr Atom = 0x15702 + Href Atom = 0x2e004 + Hreflang Atom = 0x2e008 + Html Atom = 0x5604 + HttpEquiv Atom = 0x2e80a + I Atom = 0x601 + Icon Atom = 0x58a04 + Id Atom = 0x10902 + Iframe Atom = 0x2fc06 + Image Atom = 0x30205 + Img Atom = 0x30703 + Input Atom = 0x44b05 + Inputmode Atom = 0x44b09 + Ins Atom = 0x20403 + Integrity Atom = 0x23f09 + Is Atom = 0x16502 + Isindex Atom = 0x30f07 + Ismap Atom = 0x31605 + Itemid Atom = 0x38b06 + Itemprop Atom = 0x19d08 + Itemref Atom = 0x3cd07 + Itemscope Atom = 0x67109 + Itemtype Atom = 0x31f08 + Kbd Atom = 0xb903 + Keygen Atom = 0x3206 + Keytype Atom = 0xd607 + Kind Atom = 0x17704 + Label Atom = 0x5905 + Lang Atom = 0x2e404 + Legend Atom = 0x18106 + Li Atom = 0xb202 + Link Atom = 0x17404 + List Atom = 0x4a904 + Listing Atom = 0x4a907 + Loop Atom = 0x5d04 + Low Atom = 0xc303 + Main Atom = 0x1004 + Malignmark Atom = 0xb00a + Manifest Atom = 0x6d708 + Map Atom = 0x31803 + Mark Atom = 0xb604 + Marquee Atom = 0x32707 + Math Atom = 0x32e04 + Max Atom = 0x33d03 + Maxlength Atom = 0x33d09 + Media Atom = 0xe605 + Mediagroup Atom = 0xe60a + Menu Atom = 0x38704 + Menuitem Atom = 0x38708 + Meta Atom = 0x4b804 + Meter Atom = 0x9805 + Method Atom = 0x2a806 + Mglyph Atom = 0x30806 + Mi Atom = 0x34702 + Min Atom = 0x34703 + Minlength Atom = 0x34709 + Mn Atom = 0x2b102 + Mo Atom = 0xa402 + Ms Atom = 0x67402 + Mtext Atom = 0x35105 + Multiple Atom = 0x35f08 + Muted Atom = 0x36705 + Name Atom = 0x9604 + Nav Atom = 0x1303 + Nobr Atom = 0x3704 + Noembed Atom = 0x6c07 + Noframes Atom = 0x8908 + Nomodule Atom = 0xa208 + Nonce Atom = 0x1a605 + Noscript Atom = 0x21608 + Novalidate Atom = 0x2b20a + Object Atom = 0x26806 + Ol Atom = 0x13702 + Onabort Atom = 0x19507 + Onafterprint Atom = 0x2360c + Onautocomplete Atom = 0x2760e + Onautocompleteerror Atom = 0x27613 + Onauxclick Atom = 0x61f0a + Onbeforeprint Atom = 0x69e0d + Onbeforeunload Atom = 0x6e70e + Onblur Atom = 0x56d06 + Oncancel Atom = 0x11908 + Oncanplay Atom = 0x14d09 + Oncanplaythrough Atom = 0x14d10 + Onchange Atom = 0x41b08 + Onclick Atom = 0x2f507 + Onclose Atom = 0x36c07 + Oncontextmenu Atom = 0x37e0d + Oncopy Atom = 0x39106 + Oncuechange Atom = 0x3970b + Oncut Atom = 0x3a205 + Ondblclick Atom = 0x3a70a + Ondrag Atom = 0x3b106 + Ondragend Atom = 0x3b109 + Ondragenter Atom = 0x3ba0b + Ondragexit Atom = 0x3c50a + Ondragleave Atom = 0x3df0b + Ondragover Atom = 0x3ea0a + Ondragstart Atom = 0x3f40b + Ondrop Atom = 0x40306 + Ondurationchange Atom = 0x41310 + Onemptied Atom = 0x40a09 + Onended Atom = 0x42307 + Onerror Atom = 0x42a07 + Onfocus Atom = 0x43107 + Onhashchange Atom = 0x43d0c + Oninput Atom = 0x44907 + Oninvalid Atom = 0x45509 + Onkeydown Atom = 0x45e09 + Onkeypress Atom = 0x46b0a + Onkeyup Atom = 0x48007 + Onlanguagechange Atom = 0x48d10 + Onload Atom = 0x49d06 + Onloadeddata Atom = 0x49d0c + Onloadedmetadata Atom = 0x4b010 + Onloadend Atom = 0x4c609 + Onloadstart Atom = 0x4cf0b + Onmessage Atom = 0x4da09 + Onmessageerror Atom = 0x4da0e + Onmousedown Atom = 0x4e80b + Onmouseenter Atom = 0x4f30c + Onmouseleave Atom = 0x4ff0c + Onmousemove Atom = 0x50b0b + Onmouseout Atom = 0x5160a + Onmouseover Atom = 0x5230b + Onmouseup Atom = 0x52e09 + Onmousewheel Atom = 0x53c0c + Onoffline Atom = 0x54809 + Ononline Atom = 0x55108 + Onpagehide Atom = 0x5590a + Onpageshow Atom = 0x5730a + Onpaste Atom = 0x57f07 + Onpause Atom = 0x59a07 + Onplay Atom = 0x5a406 + Onplaying Atom = 0x5a409 + Onpopstate Atom = 0x5ad0a + Onprogress Atom = 0x5b70a + Onratechange Atom = 0x5cc0c + Onrejectionhandled Atom = 0x5d812 + Onreset Atom = 0x5ea07 + Onresize Atom = 0x5f108 + Onscroll Atom = 0x60008 + Onsecuritypolicyviolation Atom = 0x60819 + Onseeked Atom = 0x62908 + Onseeking Atom = 0x63109 + Onselect Atom = 0x63a08 + Onshow Atom = 0x64406 + Onsort Atom = 0x64f06 + Onstalled Atom = 0x65909 + Onstorage Atom = 0x66209 + Onsubmit Atom = 0x66b08 + Onsuspend Atom = 0x67b09 + Ontimeupdate Atom = 0x400c + Ontoggle Atom = 0x68408 + Onunhandledrejection Atom = 0x68c14 + Onunload Atom = 0x6ab08 + Onvolumechange Atom = 0x6b30e + Onwaiting Atom = 0x6c109 + Onwheel Atom = 0x6ca07 + Open Atom = 0x1a304 + Optgroup Atom = 0x5f08 + Optimum Atom = 0x6d107 + Option Atom = 0x6e306 + Output Atom = 0x51d06 + P Atom = 0xc01 + Param Atom = 0xc05 + Pattern Atom = 0x6607 + Picture Atom = 0x7b07 + Ping Atom = 0xef04 + Placeholder Atom = 0x1310b + Plaintext Atom = 0x1b209 + Playsinline Atom = 0x1400b + Poster Atom = 0x2cf06 + Pre Atom = 0x47003 + Preload Atom = 0x48607 + Progress Atom = 0x5b908 + Prompt Atom = 0x53606 + Public Atom = 0x58606 + Q Atom = 0xcf01 + Radiogroup Atom = 0x30a + Rb Atom = 0x3a02 + Readonly Atom = 0x35708 + Referrerpolicy Atom = 0x3d10e + Rel Atom = 0x48703 + Required Atom = 0x24c08 + Reversed Atom = 0x8008 + Rows Atom = 0x9c04 + Rowspan Atom = 0x9c07 + Rp Atom = 0x23c02 + Rt Atom = 0x19a02 + Rtc Atom = 0x19a03 + Ruby Atom = 0xfb04 + S Atom = 0x2501 + Samp Atom = 0x7804 + Sandbox Atom = 0x12907 + Scope Atom = 0x67505 + Scoped Atom = 0x67506 + Script Atom = 0x21806 + Seamless Atom = 0x37108 + Section Atom = 0x56807 + Select Atom = 0x63c06 + Selected Atom = 0x63c08 + Shape Atom = 0x1e505 + Size Atom = 0x5f504 + Sizes Atom = 0x5f505 + Slot Atom = 0x1ef04 + Small Atom = 0x20605 + Sortable Atom = 0x65108 + Sorted Atom = 0x33706 + Source Atom = 0x37806 + Spacer Atom = 0x43706 + Span Atom = 0x9f04 + Spellcheck Atom = 0x4740a + Src Atom = 0x5c003 + Srcdoc Atom = 0x5c006 + Srclang Atom = 0x5f907 + Srcset Atom = 0x6f906 + Start Atom = 0x3fa05 + Step Atom = 0x58304 + Strike Atom = 0xd206 + Strong Atom = 0x6dd06 + Style Atom = 0x6ff05 + Sub Atom = 0x66d03 + Summary Atom = 0x70407 + Sup Atom = 0x70b03 + Svg Atom = 0x70e03 + System Atom = 0x71106 + Tabindex Atom = 0x4be08 + Table Atom = 0x59505 + Target Atom = 0x2c406 + Tbody Atom = 0x2705 + Td Atom = 0x9202 + Template Atom = 0x71408 + Textarea Atom = 0x35208 + Tfoot Atom = 0xf505 + Th Atom = 0x15602 + Thead Atom = 0x33005 + Time Atom = 0x4204 + Title Atom = 0x11005 + Tr Atom = 0xcc02 + Track Atom = 0x1ba05 + Translate Atom = 0x1f209 + Tt Atom = 0x6802 + Type Atom = 0xd904 + Typemustmatch Atom = 0x2900d + U Atom = 0xb01 + Ul Atom = 0xa702 + Updateviacache Atom = 0x460e + Usemap Atom = 0x59e06 + Value Atom = 0x1505 + Var Atom = 0x16d03 + Video Atom = 0x2f105 + Wbr Atom = 0x57c03 + Width Atom = 0x64905 + Workertype Atom = 0x71c0a + Wrap Atom = 0x72604 + Xmp Atom = 0x12f03 +) + +const hash0 = 0x81cdf10e + +const maxAtomLen = 25 + +var table = [1 << 9]Atom{ + 0x1: 0xe60a, // mediagroup + 0x2: 0x2e404, // lang + 0x4: 0x2c09, // accesskey + 0x5: 0x8b08, // frameset + 0x7: 0x63a08, // onselect + 0x8: 0x71106, // system + 0xa: 0x64905, // width + 0xc: 0x2890b, // formenctype + 0xd: 0x13702, // ol + 0xe: 0x3970b, // oncuechange + 0x10: 0x14b03, // bdo + 0x11: 0x11505, // audio + 0x12: 0x17a09, // draggable + 0x14: 0x2f105, // video + 0x15: 0x2b102, // mn + 0x16: 0x38704, // menu + 0x17: 0x2cf06, // poster + 0x19: 0xf606, // footer + 0x1a: 0x2a806, // method + 0x1b: 0x2b808, // datetime + 0x1c: 0x19507, // onabort + 0x1d: 0x460e, // updateviacache + 0x1e: 0xff05, // async + 0x1f: 0x49d06, // onload + 0x21: 0x11908, // oncancel + 0x22: 0x62908, // onseeked + 0x23: 0x30205, // image + 0x24: 0x5d812, // onrejectionhandled + 0x26: 0x17404, // link + 0x27: 0x51d06, // output + 0x28: 0x33104, // head + 0x29: 0x4ff0c, // onmouseleave + 0x2a: 0x57f07, // onpaste + 0x2b: 0x5a409, // onplaying + 0x2c: 0x1c407, // colspan + 0x2f: 0x1bf05, // color + 0x30: 0x5f504, // size + 0x31: 0x2e80a, // http-equiv + 0x33: 0x601, // i + 0x34: 0x5590a, // onpagehide + 0x35: 0x68c14, // onunhandledrejection + 0x37: 0x42a07, // onerror + 0x3a: 0x3b08, // basefont + 0x3f: 0x1303, // nav + 0x40: 0x17704, // kind + 0x41: 0x35708, // readonly + 0x42: 0x30806, // mglyph + 0x44: 0xb202, // li + 0x46: 0x2d506, // hidden + 0x47: 0x70e03, // svg + 0x48: 0x58304, // step + 0x49: 0x23f09, // integrity + 0x4a: 0x58606, // public + 0x4c: 0x1ab03, // col + 0x4d: 0x1870a, // blockquote + 0x4e: 0x34f02, // h5 + 0x50: 0x5b908, // progress + 0x51: 0x5f505, // sizes + 0x52: 0x34502, // h4 + 0x56: 0x33005, // thead + 0x57: 0xd607, // keytype + 0x58: 0x5b70a, // onprogress + 0x59: 0x44b09, // inputmode + 0x5a: 0x3b109, // ondragend + 0x5d: 0x3a205, // oncut + 0x5e: 0x43706, // spacer + 0x5f: 0x1ab08, // colgroup + 0x62: 0x16502, // is + 0x65: 0x3c02, // as + 0x66: 0x54809, // onoffline + 0x67: 0x33706, // sorted + 0x69: 0x48d10, // onlanguagechange + 0x6c: 0x43d0c, // onhashchange + 0x6d: 0x9604, // name + 0x6e: 0xf505, // tfoot + 0x6f: 0x56104, // desc + 0x70: 0x33d03, // max + 0x72: 0x1ea06, // coords + 0x73: 0x30d02, // h3 + 0x74: 0x6e70e, // onbeforeunload + 0x75: 0x9c04, // rows + 0x76: 0x63c06, // select + 0x77: 0x9805, // meter + 0x78: 0x38b06, // itemid + 0x79: 0x53c0c, // onmousewheel + 0x7a: 0x5c006, // srcdoc + 0x7d: 0x1ba05, // track + 0x7f: 0x31f08, // itemtype + 0x82: 0xa402, // mo + 0x83: 0x41b08, // onchange + 0x84: 0x33107, // headers + 0x85: 0x5cc0c, // onratechange + 0x86: 0x60819, // onsecuritypolicyviolation + 0x88: 0x4a508, // datalist + 0x89: 0x4e80b, // onmousedown + 0x8a: 0x1ef04, // slot + 0x8b: 0x4b010, // onloadedmetadata + 0x8c: 0x1a06, // accept + 0x8d: 0x26806, // object + 0x91: 0x6b30e, // onvolumechange + 0x92: 0x2107, // charset + 0x93: 0x27613, // onautocompleteerror + 0x94: 0xc113, // allowpaymentrequest + 0x95: 0x2804, // body + 0x96: 0x10a07, // default + 0x97: 0x63c08, // selected + 0x98: 0x21e04, // face + 0x99: 0x1e505, // shape + 0x9b: 0x68408, // ontoggle + 0x9e: 0x64b02, // dt + 0x9f: 0xb604, // mark + 0xa1: 0xb01, // u + 0xa4: 0x6ab08, // onunload + 0xa5: 0x5d04, // loop + 0xa6: 0x16408, // disabled + 0xaa: 0x42307, // onended + 0xab: 0xb00a, // malignmark + 0xad: 0x67b09, // onsuspend + 0xae: 0x35105, // mtext + 0xaf: 0x64f06, // onsort + 0xb0: 0x19d08, // itemprop + 0xb3: 0x67109, // itemscope + 0xb4: 0x17305, // blink + 0xb6: 0x3b106, // ondrag + 0xb7: 0xa702, // ul + 0xb8: 0x26e04, // form + 0xb9: 0x12907, // sandbox + 0xba: 0x8b05, // frame + 0xbb: 0x1505, // value + 0xbc: 0x66209, // onstorage + 0xbf: 0xaa07, // acronym + 0xc0: 0x19a02, // rt + 0xc2: 0x202, // br + 0xc3: 0x22608, // fieldset + 0xc4: 0x2900d, // typemustmatch + 0xc5: 0xa208, // nomodule + 0xc6: 0x6c07, // noembed + 0xc7: 0x69e0d, // onbeforeprint + 0xc8: 0x19106, // button + 0xc9: 0x2f507, // onclick + 0xca: 0x70407, // summary + 0xcd: 0xfb04, // ruby + 0xce: 0x56405, // class + 0xcf: 0x3f40b, // ondragstart + 0xd0: 0x23107, // caption + 0xd4: 0xdd0e, // allowusermedia + 0xd5: 0x4cf0b, // onloadstart + 0xd9: 0x16b03, // div + 0xda: 0x4a904, // list + 0xdb: 0x32e04, // math + 0xdc: 0x44b05, // input + 0xdf: 0x3ea0a, // ondragover + 0xe0: 0x2de02, // h2 + 0xe2: 0x1b209, // plaintext + 0xe4: 0x4f30c, // onmouseenter + 0xe7: 0x47907, // checked + 0xe8: 0x47003, // pre + 0xea: 0x35f08, // multiple + 0xeb: 0xba03, // bdi + 0xec: 0x33d09, // maxlength + 0xed: 0xcf01, // q + 0xee: 0x61f0a, // onauxclick + 0xf0: 0x57c03, // wbr + 0xf2: 0x3b04, // base + 0xf3: 0x6e306, // option + 0xf5: 0x41310, // ondurationchange + 0xf7: 0x8908, // noframes + 0xf9: 0x40508, // dropzone + 0xfb: 0x67505, // scope + 0xfc: 0x8008, // reversed + 0xfd: 0x3ba0b, // ondragenter + 0xfe: 0x3fa05, // start + 0xff: 0x12f03, // xmp + 0x100: 0x5f907, // srclang + 0x101: 0x30703, // img + 0x104: 0x101, // b + 0x105: 0x25403, // for + 0x106: 0x10705, // aside + 0x107: 0x44907, // oninput + 0x108: 0x35604, // area + 0x109: 0x2a40a, // formmethod + 0x10a: 0x72604, // wrap + 0x10c: 0x23c02, // rp + 0x10d: 0x46b0a, // onkeypress + 0x10e: 0x6802, // tt + 0x110: 0x34702, // mi + 0x111: 0x36705, // muted + 0x112: 0xf303, // alt + 0x113: 0x5c504, // code + 0x114: 0x6e02, // em + 0x115: 0x3c50a, // ondragexit + 0x117: 0x9f04, // span + 0x119: 0x6d708, // manifest + 0x11a: 0x38708, // menuitem + 0x11b: 0x58b07, // content + 0x11d: 0x6c109, // onwaiting + 0x11f: 0x4c609, // onloadend + 0x121: 0x37e0d, // oncontextmenu + 0x123: 0x56d06, // onblur + 0x124: 0x3fc07, // article + 0x125: 0x9303, // dir + 0x126: 0xef04, // ping + 0x127: 0x24c08, // required + 0x128: 0x45509, // oninvalid + 0x129: 0xb105, // align + 0x12b: 0x58a04, // icon + 0x12c: 0x64d02, // h6 + 0x12d: 0x1c404, // cols + 0x12e: 0x22e0a, // figcaption + 0x12f: 0x45e09, // onkeydown + 0x130: 0x66b08, // onsubmit + 0x131: 0x14d09, // oncanplay + 0x132: 0x70b03, // sup + 0x133: 0xc01, // p + 0x135: 0x40a09, // onemptied + 0x136: 0x39106, // oncopy + 0x137: 0x19c04, // cite + 0x138: 0x3a70a, // ondblclick + 0x13a: 0x50b0b, // onmousemove + 0x13c: 0x66d03, // sub + 0x13d: 0x48703, // rel + 0x13e: 0x5f08, // optgroup + 0x142: 0x9c07, // rowspan + 0x143: 0x37806, // source + 0x144: 0x21608, // noscript + 0x145: 0x1a304, // open + 0x146: 0x20403, // ins + 0x147: 0x2540d, // foreignObject + 0x148: 0x5ad0a, // onpopstate + 0x14a: 0x28d07, // enctype + 0x14b: 0x2760e, // onautocomplete + 0x14c: 0x35208, // textarea + 0x14e: 0x2780c, // autocomplete + 0x14f: 0x15702, // hr + 0x150: 0x1de08, // controls + 0x151: 0x10902, // id + 0x153: 0x2360c, // onafterprint + 0x155: 0x2610d, // foreignobject + 0x156: 0x32707, // marquee + 0x157: 0x59a07, // onpause + 0x158: 0x5e602, // dl + 0x159: 0x5206, // height + 0x15a: 0x34703, // min + 0x15b: 0x9307, // dirname + 0x15c: 0x1f209, // translate + 0x15d: 0x5604, // html + 0x15e: 0x34709, // minlength + 0x15f: 0x48607, // preload + 0x160: 0x71408, // template + 0x161: 0x3df0b, // ondragleave + 0x162: 0x3a02, // rb + 0x164: 0x5c003, // src + 0x165: 0x6dd06, // strong + 0x167: 0x7804, // samp + 0x168: 0x6f307, // address + 0x169: 0x55108, // ononline + 0x16b: 0x1310b, // placeholder + 0x16c: 0x2c406, // target + 0x16d: 0x20605, // small + 0x16e: 0x6ca07, // onwheel + 0x16f: 0x1c90a, // annotation + 0x170: 0x4740a, // spellcheck + 0x171: 0x7207, // details + 0x172: 0x10306, // canvas + 0x173: 0x12109, // autofocus + 0x174: 0xc05, // param + 0x176: 0x46308, // download + 0x177: 0x45203, // del + 0x178: 0x36c07, // onclose + 0x179: 0xb903, // kbd + 0x17a: 0x31906, // applet + 0x17b: 0x2e004, // href + 0x17c: 0x5f108, // onresize + 0x17e: 0x49d0c, // onloadeddata + 0x180: 0xcc02, // tr + 0x181: 0x2c00a, // formtarget + 0x182: 0x11005, // title + 0x183: 0x6ff05, // style + 0x184: 0xd206, // strike + 0x185: 0x59e06, // usemap + 0x186: 0x2fc06, // iframe + 0x187: 0x1004, // main + 0x189: 0x7b07, // picture + 0x18c: 0x31605, // ismap + 0x18e: 0x4a504, // data + 0x18f: 0x5905, // label + 0x191: 0x3d10e, // referrerpolicy + 0x192: 0x15602, // th + 0x194: 0x53606, // prompt + 0x195: 0x56807, // section + 0x197: 0x6d107, // optimum + 0x198: 0x2db04, // high + 0x199: 0x15c02, // h1 + 0x19a: 0x65909, // onstalled + 0x19b: 0x16d03, // var + 0x19c: 0x4204, // time + 0x19e: 0x67402, // ms + 0x19f: 0x33106, // header + 0x1a0: 0x4da09, // onmessage + 0x1a1: 0x1a605, // nonce + 0x1a2: 0x26e0a, // formaction + 0x1a3: 0x22006, // center + 0x1a4: 0x3704, // nobr + 0x1a5: 0x59505, // table + 0x1a6: 0x4a907, // listing + 0x1a7: 0x18106, // legend + 0x1a9: 0x29b09, // challenge + 0x1aa: 0x24806, // figure + 0x1ab: 0xe605, // media + 0x1ae: 0xd904, // type + 0x1af: 0x3f04, // font + 0x1b0: 0x4da0e, // onmessageerror + 0x1b1: 0x37108, // seamless + 0x1b2: 0x8703, // dfn + 0x1b3: 0x5c705, // defer + 0x1b4: 0xc303, // low + 0x1b5: 0x19a03, // rtc + 0x1b6: 0x5230b, // onmouseover + 0x1b7: 0x2b20a, // novalidate + 0x1b8: 0x71c0a, // workertype + 0x1ba: 0x3cd07, // itemref + 0x1bd: 0x1, // a + 0x1be: 0x31803, // map + 0x1bf: 0x400c, // ontimeupdate + 0x1c0: 0x15e07, // bgsound + 0x1c1: 0x3206, // keygen + 0x1c2: 0x2705, // tbody + 0x1c5: 0x64406, // onshow + 0x1c7: 0x2501, // s + 0x1c8: 0x6607, // pattern + 0x1cc: 0x14d10, // oncanplaythrough + 0x1ce: 0x2d702, // dd + 0x1cf: 0x6f906, // srcset + 0x1d0: 0x17003, // big + 0x1d2: 0x65108, // sortable + 0x1d3: 0x48007, // onkeyup + 0x1d5: 0x5a406, // onplay + 0x1d7: 0x4b804, // meta + 0x1d8: 0x40306, // ondrop + 0x1da: 0x60008, // onscroll + 0x1db: 0x1fb0b, // crossorigin + 0x1dc: 0x5730a, // onpageshow + 0x1dd: 0x4, // abbr + 0x1de: 0x9202, // td + 0x1df: 0x58b0f, // contenteditable + 0x1e0: 0x27206, // action + 0x1e1: 0x1400b, // playsinline + 0x1e2: 0x43107, // onfocus + 0x1e3: 0x2e008, // hreflang + 0x1e5: 0x5160a, // onmouseout + 0x1e6: 0x5ea07, // onreset + 0x1e7: 0x13c08, // autoplay + 0x1e8: 0x63109, // onseeking + 0x1ea: 0x67506, // scoped + 0x1ec: 0x30a, // radiogroup + 0x1ee: 0x3800b, // contextmenu + 0x1ef: 0x52e09, // onmouseup + 0x1f1: 0x2ca06, // hgroup + 0x1f2: 0x2080f, // allowfullscreen + 0x1f3: 0x4be08, // tabindex + 0x1f6: 0x30f07, // isindex + 0x1f7: 0x1a0e, // accept-charset + 0x1f8: 0x2ae0e, // formnovalidate + 0x1fb: 0x1c90e, // annotation-xml + 0x1fc: 0x6e05, // embed + 0x1fd: 0x21806, // script + 0x1fe: 0xbb06, // dialog + 0x1ff: 0x1d707, // command +} + +const atomText = "abbradiogrouparamainavalueaccept-charsetbodyaccesskeygenobrb" + + "asefontimeupdateviacacheightmlabelooptgroupatternoembedetail" + + "sampictureversedfnoframesetdirnameterowspanomoduleacronymali" + + "gnmarkbdialogallowpaymentrequestrikeytypeallowusermediagroup" + + "ingaltfooterubyasyncanvasidefaultitleaudioncancelautofocusan" + + "dboxmplaceholderautoplaysinlinebdoncanplaythrough1bgsoundisa" + + "bledivarbigblinkindraggablegendblockquotebuttonabortcitempro" + + "penoncecolgrouplaintextrackcolorcolspannotation-xmlcommandco" + + "ntrolshapecoordslotranslatecrossoriginsmallowfullscreenoscri" + + "ptfacenterfieldsetfigcaptionafterprintegrityfigurequiredfore" + + "ignObjectforeignobjectformactionautocompleteerrorformenctype" + + "mustmatchallengeformmethodformnovalidatetimeformtargethgroup" + + "osterhiddenhigh2hreflanghttp-equivideonclickiframeimageimgly" + + "ph3isindexismappletitemtypemarqueematheadersortedmaxlength4m" + + "inlength5mtextareadonlymultiplemutedoncloseamlessourceoncont" + + "extmenuitemidoncopyoncuechangeoncutondblclickondragendondrag" + + "enterondragexitemreferrerpolicyondragleaveondragoverondragst" + + "articleondropzonemptiedondurationchangeonendedonerroronfocus" + + "paceronhashchangeoninputmodeloninvalidonkeydownloadonkeypres" + + "spellcheckedonkeyupreloadonlanguagechangeonloadeddatalisting" + + "onloadedmetadatabindexonloadendonloadstartonmessageerroronmo" + + "usedownonmouseenteronmouseleaveonmousemoveonmouseoutputonmou" + + "seoveronmouseupromptonmousewheelonofflineononlineonpagehides" + + "classectionbluronpageshowbronpastepublicontenteditableonpaus" + + "emaponplayingonpopstateonprogressrcdocodeferonratechangeonre" + + "jectionhandledonresetonresizesrclangonscrollonsecuritypolicy" + + "violationauxclickonseekedonseekingonselectedonshowidth6onsor" + + "tableonstalledonstorageonsubmitemscopedonsuspendontoggleonun" + + "handledrejectionbeforeprintonunloadonvolumechangeonwaitingon" + + "wheeloptimumanifestrongoptionbeforeunloaddressrcsetstylesumm" + + "arysupsvgsystemplateworkertypewrap" diff --git a/vendor/golang.org/x/net/html/charset/charset.go b/vendor/golang.org/x/net/html/charset/charset.go new file mode 100644 index 0000000..13bed15 --- /dev/null +++ b/vendor/golang.org/x/net/html/charset/charset.go @@ -0,0 +1,257 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package charset provides common text encodings for HTML documents. +// +// The mapping from encoding labels to encodings is defined at +// https://encoding.spec.whatwg.org/. +package charset // import "golang.org/x/net/html/charset" + +import ( + "bytes" + "fmt" + "io" + "mime" + "strings" + "unicode/utf8" + + "golang.org/x/net/html" + "golang.org/x/text/encoding" + "golang.org/x/text/encoding/charmap" + "golang.org/x/text/encoding/htmlindex" + "golang.org/x/text/transform" +) + +// Lookup returns the encoding with the specified label, and its canonical +// name. It returns nil and the empty string if label is not one of the +// standard encodings for HTML. Matching is case-insensitive and ignores +// leading and trailing whitespace. Encoders will use HTML escape sequences for +// runes that are not supported by the character set. +func Lookup(label string) (e encoding.Encoding, name string) { + e, err := htmlindex.Get(label) + if err != nil { + return nil, "" + } + name, _ = htmlindex.Name(e) + return &htmlEncoding{e}, name +} + +type htmlEncoding struct{ encoding.Encoding } + +func (h *htmlEncoding) NewEncoder() *encoding.Encoder { + // HTML requires a non-terminating legacy encoder. We use HTML escapes to + // substitute unsupported code points. + return encoding.HTMLEscapeUnsupported(h.Encoding.NewEncoder()) +} + +// DetermineEncoding determines the encoding of an HTML document by examining +// up to the first 1024 bytes of content and the declared Content-Type. +// +// See http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#determining-the-character-encoding +func DetermineEncoding(content []byte, contentType string) (e encoding.Encoding, name string, certain bool) { + if len(content) > 1024 { + content = content[:1024] + } + + for _, b := range boms { + if bytes.HasPrefix(content, b.bom) { + e, name = Lookup(b.enc) + return e, name, true + } + } + + if _, params, err := mime.ParseMediaType(contentType); err == nil { + if cs, ok := params["charset"]; ok { + if e, name = Lookup(cs); e != nil { + return e, name, true + } + } + } + + if len(content) > 0 { + e, name = prescan(content) + if e != nil { + return e, name, false + } + } + + // Try to detect UTF-8. + // First eliminate any partial rune at the end. + for i := len(content) - 1; i >= 0 && i > len(content)-4; i-- { + b := content[i] + if b < 0x80 { + break + } + if utf8.RuneStart(b) { + content = content[:i] + break + } + } + hasHighBit := false + for _, c := range content { + if c >= 0x80 { + hasHighBit = true + break + } + } + if hasHighBit && utf8.Valid(content) { + return encoding.Nop, "utf-8", false + } + + // TODO: change default depending on user's locale? + return charmap.Windows1252, "windows-1252", false +} + +// NewReader returns an io.Reader that converts the content of r to UTF-8. +// It calls DetermineEncoding to find out what r's encoding is. +func NewReader(r io.Reader, contentType string) (io.Reader, error) { + preview := make([]byte, 1024) + n, err := io.ReadFull(r, preview) + switch { + case err == io.ErrUnexpectedEOF: + preview = preview[:n] + r = bytes.NewReader(preview) + case err != nil: + return nil, err + default: + r = io.MultiReader(bytes.NewReader(preview), r) + } + + if e, _, _ := DetermineEncoding(preview, contentType); e != encoding.Nop { + r = transform.NewReader(r, e.NewDecoder()) + } + return r, nil +} + +// NewReaderLabel returns a reader that converts from the specified charset to +// UTF-8. It uses Lookup to find the encoding that corresponds to label, and +// returns an error if Lookup returns nil. It is suitable for use as +// encoding/xml.Decoder's CharsetReader function. +func NewReaderLabel(label string, input io.Reader) (io.Reader, error) { + e, _ := Lookup(label) + if e == nil { + return nil, fmt.Errorf("unsupported charset: %q", label) + } + return transform.NewReader(input, e.NewDecoder()), nil +} + +func prescan(content []byte) (e encoding.Encoding, name string) { + z := html.NewTokenizer(bytes.NewReader(content)) + for { + switch z.Next() { + case html.ErrorToken: + return nil, "" + + case html.StartTagToken, html.SelfClosingTagToken: + tagName, hasAttr := z.TagName() + if !bytes.Equal(tagName, []byte("meta")) { + continue + } + attrList := make(map[string]bool) + gotPragma := false + + const ( + dontKnow = iota + doNeedPragma + doNotNeedPragma + ) + needPragma := dontKnow + + name = "" + e = nil + for hasAttr { + var key, val []byte + key, val, hasAttr = z.TagAttr() + ks := string(key) + if attrList[ks] { + continue + } + attrList[ks] = true + for i, c := range val { + if 'A' <= c && c <= 'Z' { + val[i] = c + 0x20 + } + } + + switch ks { + case "http-equiv": + if bytes.Equal(val, []byte("content-type")) { + gotPragma = true + } + + case "content": + if e == nil { + name = fromMetaElement(string(val)) + if name != "" { + e, name = Lookup(name) + if e != nil { + needPragma = doNeedPragma + } + } + } + + case "charset": + e, name = Lookup(string(val)) + needPragma = doNotNeedPragma + } + } + + if needPragma == dontKnow || needPragma == doNeedPragma && !gotPragma { + continue + } + + if strings.HasPrefix(name, "utf-16") { + name = "utf-8" + e = encoding.Nop + } + + if e != nil { + return e, name + } + } + } +} + +func fromMetaElement(s string) string { + for s != "" { + csLoc := strings.Index(s, "charset") + if csLoc == -1 { + return "" + } + s = s[csLoc+len("charset"):] + s = strings.TrimLeft(s, " \t\n\f\r") + if !strings.HasPrefix(s, "=") { + continue + } + s = s[1:] + s = strings.TrimLeft(s, " \t\n\f\r") + if s == "" { + return "" + } + if q := s[0]; q == '"' || q == '\'' { + s = s[1:] + closeQuote := strings.IndexRune(s, rune(q)) + if closeQuote == -1 { + return "" + } + return s[:closeQuote] + } + + end := strings.IndexAny(s, "; \t\n\f\r") + if end == -1 { + end = len(s) + } + return s[:end] + } + return "" +} + +var boms = []struct { + bom []byte + enc string +}{ + {[]byte{0xfe, 0xff}, "utf-16be"}, + {[]byte{0xff, 0xfe}, "utf-16le"}, + {[]byte{0xef, 0xbb, 0xbf}, "utf-8"}, +} diff --git a/vendor/golang.org/x/net/html/const.go b/vendor/golang.org/x/net/html/const.go new file mode 100644 index 0000000..ff7acf2 --- /dev/null +++ b/vendor/golang.org/x/net/html/const.go @@ -0,0 +1,111 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package html + +// Section 12.2.4.2 of the HTML5 specification says "The following elements +// have varying levels of special parsing rules". +// https://html.spec.whatwg.org/multipage/syntax.html#the-stack-of-open-elements +var isSpecialElementMap = map[string]bool{ + "address": true, + "applet": true, + "area": true, + "article": true, + "aside": true, + "base": true, + "basefont": true, + "bgsound": true, + "blockquote": true, + "body": true, + "br": true, + "button": true, + "caption": true, + "center": true, + "col": true, + "colgroup": true, + "dd": true, + "details": true, + "dir": true, + "div": true, + "dl": true, + "dt": true, + "embed": true, + "fieldset": true, + "figcaption": true, + "figure": true, + "footer": true, + "form": true, + "frame": true, + "frameset": true, + "h1": true, + "h2": true, + "h3": true, + "h4": true, + "h5": true, + "h6": true, + "head": true, + "header": true, + "hgroup": true, + "hr": true, + "html": true, + "iframe": true, + "img": true, + "input": true, + "keygen": true, // "keygen" has been removed from the spec, but are kept here for backwards compatibility. + "li": true, + "link": true, + "listing": true, + "main": true, + "marquee": true, + "menu": true, + "meta": true, + "nav": true, + "noembed": true, + "noframes": true, + "noscript": true, + "object": true, + "ol": true, + "p": true, + "param": true, + "plaintext": true, + "pre": true, + "script": true, + "section": true, + "select": true, + "source": true, + "style": true, + "summary": true, + "table": true, + "tbody": true, + "td": true, + "template": true, + "textarea": true, + "tfoot": true, + "th": true, + "thead": true, + "title": true, + "tr": true, + "track": true, + "ul": true, + "wbr": true, + "xmp": true, +} + +func isSpecialElement(element *Node) bool { + switch element.Namespace { + case "", "html": + return isSpecialElementMap[element.Data] + case "math": + switch element.Data { + case "mi", "mo", "mn", "ms", "mtext", "annotation-xml": + return true + } + case "svg": + switch element.Data { + case "foreignObject", "desc", "title": + return true + } + } + return false +} diff --git a/vendor/golang.org/x/net/html/doc.go b/vendor/golang.org/x/net/html/doc.go new file mode 100644 index 0000000..822ed42 --- /dev/null +++ b/vendor/golang.org/x/net/html/doc.go @@ -0,0 +1,106 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* +Package html implements an HTML5-compliant tokenizer and parser. + +Tokenization is done by creating a Tokenizer for an io.Reader r. It is the +caller's responsibility to ensure that r provides UTF-8 encoded HTML. + + z := html.NewTokenizer(r) + +Given a Tokenizer z, the HTML is tokenized by repeatedly calling z.Next(), +which parses the next token and returns its type, or an error: + + for { + tt := z.Next() + if tt == html.ErrorToken { + // ... + return ... + } + // Process the current token. + } + +There are two APIs for retrieving the current token. The high-level API is to +call Token; the low-level API is to call Text or TagName / TagAttr. Both APIs +allow optionally calling Raw after Next but before Token, Text, TagName, or +TagAttr. In EBNF notation, the valid call sequence per token is: + + Next {Raw} [ Token | Text | TagName {TagAttr} ] + +Token returns an independent data structure that completely describes a token. +Entities (such as "<") are unescaped, tag names and attribute keys are +lower-cased, and attributes are collected into a []Attribute. For example: + + for { + if z.Next() == html.ErrorToken { + // Returning io.EOF indicates success. + return z.Err() + } + emitToken(z.Token()) + } + +The low-level API performs fewer allocations and copies, but the contents of +the []byte values returned by Text, TagName and TagAttr may change on the next +call to Next. For example, to extract an HTML page's anchor text: + + depth := 0 + for { + tt := z.Next() + switch tt { + case html.ErrorToken: + return z.Err() + case html.TextToken: + if depth > 0 { + // emitBytes should copy the []byte it receives, + // if it doesn't process it immediately. + emitBytes(z.Text()) + } + case html.StartTagToken, html.EndTagToken: + tn, _ := z.TagName() + if len(tn) == 1 && tn[0] == 'a' { + if tt == html.StartTagToken { + depth++ + } else { + depth-- + } + } + } + } + +Parsing is done by calling Parse with an io.Reader, which returns the root of +the parse tree (the document element) as a *Node. It is the caller's +responsibility to ensure that the Reader provides UTF-8 encoded HTML. For +example, to process each anchor node in depth-first order: + + doc, err := html.Parse(r) + if err != nil { + // ... + } + var f func(*html.Node) + f = func(n *html.Node) { + if n.Type == html.ElementNode && n.Data == "a" { + // Do something with n... + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + f(c) + } + } + f(doc) + +The relevant specifications include: +https://html.spec.whatwg.org/multipage/syntax.html and +https://html.spec.whatwg.org/multipage/syntax.html#tokenization +*/ +package html // import "golang.org/x/net/html" + +// The tokenization algorithm implemented by this package is not a line-by-line +// transliteration of the relatively verbose state-machine in the WHATWG +// specification. A more direct approach is used instead, where the program +// counter implies the state, such as whether it is tokenizing a tag or a text +// node. Specification compliance is verified by checking expected and actual +// outputs over a test suite rather than aiming for algorithmic fidelity. + +// TODO(nigeltao): Does a DOM API belong in this package or a separate one? +// TODO(nigeltao): How does parsing interact with a JavaScript engine? diff --git a/vendor/golang.org/x/net/html/doctype.go b/vendor/golang.org/x/net/html/doctype.go new file mode 100644 index 0000000..c484e5a --- /dev/null +++ b/vendor/golang.org/x/net/html/doctype.go @@ -0,0 +1,156 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package html + +import ( + "strings" +) + +// parseDoctype parses the data from a DoctypeToken into a name, +// public identifier, and system identifier. It returns a Node whose Type +// is DoctypeNode, whose Data is the name, and which has attributes +// named "system" and "public" for the two identifiers if they were present. +// quirks is whether the document should be parsed in "quirks mode". +func parseDoctype(s string) (n *Node, quirks bool) { + n = &Node{Type: DoctypeNode} + + // Find the name. + space := strings.IndexAny(s, whitespace) + if space == -1 { + space = len(s) + } + n.Data = s[:space] + // The comparison to "html" is case-sensitive. + if n.Data != "html" { + quirks = true + } + n.Data = strings.ToLower(n.Data) + s = strings.TrimLeft(s[space:], whitespace) + + if len(s) < 6 { + // It can't start with "PUBLIC" or "SYSTEM". + // Ignore the rest of the string. + return n, quirks || s != "" + } + + key := strings.ToLower(s[:6]) + s = s[6:] + for key == "public" || key == "system" { + s = strings.TrimLeft(s, whitespace) + if s == "" { + break + } + quote := s[0] + if quote != '"' && quote != '\'' { + break + } + s = s[1:] + q := strings.IndexRune(s, rune(quote)) + var id string + if q == -1 { + id = s + s = "" + } else { + id = s[:q] + s = s[q+1:] + } + n.Attr = append(n.Attr, Attribute{Key: key, Val: id}) + if key == "public" { + key = "system" + } else { + key = "" + } + } + + if key != "" || s != "" { + quirks = true + } else if len(n.Attr) > 0 { + if n.Attr[0].Key == "public" { + public := strings.ToLower(n.Attr[0].Val) + switch public { + case "-//w3o//dtd w3 html strict 3.0//en//", "-/w3d/dtd html 4.0 transitional/en", "html": + quirks = true + default: + for _, q := range quirkyIDs { + if strings.HasPrefix(public, q) { + quirks = true + break + } + } + } + // The following two public IDs only cause quirks mode if there is no system ID. + if len(n.Attr) == 1 && (strings.HasPrefix(public, "-//w3c//dtd html 4.01 frameset//") || + strings.HasPrefix(public, "-//w3c//dtd html 4.01 transitional//")) { + quirks = true + } + } + if lastAttr := n.Attr[len(n.Attr)-1]; lastAttr.Key == "system" && + strings.ToLower(lastAttr.Val) == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd" { + quirks = true + } + } + + return n, quirks +} + +// quirkyIDs is a list of public doctype identifiers that cause a document +// to be interpreted in quirks mode. The identifiers should be in lower case. +var quirkyIDs = []string{ + "+//silmaril//dtd html pro v0r11 19970101//", + "-//advasoft ltd//dtd html 3.0 aswedit + extensions//", + "-//as//dtd html 3.0 aswedit + extensions//", + "-//ietf//dtd html 2.0 level 1//", + "-//ietf//dtd html 2.0 level 2//", + "-//ietf//dtd html 2.0 strict level 1//", + "-//ietf//dtd html 2.0 strict level 2//", + "-//ietf//dtd html 2.0 strict//", + "-//ietf//dtd html 2.0//", + "-//ietf//dtd html 2.1e//", + "-//ietf//dtd html 3.0//", + "-//ietf//dtd html 3.2 final//", + "-//ietf//dtd html 3.2//", + "-//ietf//dtd html 3//", + "-//ietf//dtd html level 0//", + "-//ietf//dtd html level 1//", + "-//ietf//dtd html level 2//", + "-//ietf//dtd html level 3//", + "-//ietf//dtd html strict level 0//", + "-//ietf//dtd html strict level 1//", + "-//ietf//dtd html strict level 2//", + "-//ietf//dtd html strict level 3//", + "-//ietf//dtd html strict//", + "-//ietf//dtd html//", + "-//metrius//dtd metrius presentational//", + "-//microsoft//dtd internet explorer 2.0 html strict//", + "-//microsoft//dtd internet explorer 2.0 html//", + "-//microsoft//dtd internet explorer 2.0 tables//", + "-//microsoft//dtd internet explorer 3.0 html strict//", + "-//microsoft//dtd internet explorer 3.0 html//", + "-//microsoft//dtd internet explorer 3.0 tables//", + "-//netscape comm. corp.//dtd html//", + "-//netscape comm. corp.//dtd strict html//", + "-//o'reilly and associates//dtd html 2.0//", + "-//o'reilly and associates//dtd html extended 1.0//", + "-//o'reilly and associates//dtd html extended relaxed 1.0//", + "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//", + "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//", + "-//spyglass//dtd html 2.0 extended//", + "-//sq//dtd html 2.0 hotmetal + extensions//", + "-//sun microsystems corp.//dtd hotjava html//", + "-//sun microsystems corp.//dtd hotjava strict html//", + "-//w3c//dtd html 3 1995-03-24//", + "-//w3c//dtd html 3.2 draft//", + "-//w3c//dtd html 3.2 final//", + "-//w3c//dtd html 3.2//", + "-//w3c//dtd html 3.2s draft//", + "-//w3c//dtd html 4.0 frameset//", + "-//w3c//dtd html 4.0 transitional//", + "-//w3c//dtd html experimental 19960712//", + "-//w3c//dtd html experimental 970421//", + "-//w3c//dtd w3 html//", + "-//w3o//dtd w3 html 3.0//", + "-//webtechs//dtd mozilla html 2.0//", + "-//webtechs//dtd mozilla html//", +} diff --git a/vendor/golang.org/x/net/html/entity.go b/vendor/golang.org/x/net/html/entity.go new file mode 100644 index 0000000..b628880 --- /dev/null +++ b/vendor/golang.org/x/net/html/entity.go @@ -0,0 +1,2253 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package html + +// All entities that do not end with ';' are 6 or fewer bytes long. +const longestEntityWithoutSemicolon = 6 + +// entity is a map from HTML entity names to their values. The semicolon matters: +// https://html.spec.whatwg.org/multipage/syntax.html#named-character-references +// lists both "amp" and "amp;" as two separate entries. +// +// Note that the HTML5 list is larger than the HTML4 list at +// http://www.w3.org/TR/html4/sgml/entities.html +var entity = map[string]rune{ + "AElig;": '\U000000C6', + "AMP;": '\U00000026', + "Aacute;": '\U000000C1', + "Abreve;": '\U00000102', + "Acirc;": '\U000000C2', + "Acy;": '\U00000410', + "Afr;": '\U0001D504', + "Agrave;": '\U000000C0', + "Alpha;": '\U00000391', + "Amacr;": '\U00000100', + "And;": '\U00002A53', + "Aogon;": '\U00000104', + "Aopf;": '\U0001D538', + "ApplyFunction;": '\U00002061', + "Aring;": '\U000000C5', + "Ascr;": '\U0001D49C', + "Assign;": '\U00002254', + "Atilde;": '\U000000C3', + "Auml;": '\U000000C4', + "Backslash;": '\U00002216', + "Barv;": '\U00002AE7', + "Barwed;": '\U00002306', + "Bcy;": '\U00000411', + "Because;": '\U00002235', + "Bernoullis;": '\U0000212C', + "Beta;": '\U00000392', + "Bfr;": '\U0001D505', + "Bopf;": '\U0001D539', + "Breve;": '\U000002D8', + "Bscr;": '\U0000212C', + "Bumpeq;": '\U0000224E', + "CHcy;": '\U00000427', + "COPY;": '\U000000A9', + "Cacute;": '\U00000106', + "Cap;": '\U000022D2', + "CapitalDifferentialD;": '\U00002145', + "Cayleys;": '\U0000212D', + "Ccaron;": '\U0000010C', + "Ccedil;": '\U000000C7', + "Ccirc;": '\U00000108', + "Cconint;": '\U00002230', + "Cdot;": '\U0000010A', + "Cedilla;": '\U000000B8', + "CenterDot;": '\U000000B7', + "Cfr;": '\U0000212D', + "Chi;": '\U000003A7', + "CircleDot;": '\U00002299', + "CircleMinus;": '\U00002296', + "CirclePlus;": '\U00002295', + "CircleTimes;": '\U00002297', + "ClockwiseContourIntegral;": '\U00002232', + "CloseCurlyDoubleQuote;": '\U0000201D', + "CloseCurlyQuote;": '\U00002019', + "Colon;": '\U00002237', + "Colone;": '\U00002A74', + "Congruent;": '\U00002261', + "Conint;": '\U0000222F', + "ContourIntegral;": '\U0000222E', + "Copf;": '\U00002102', + "Coproduct;": '\U00002210', + "CounterClockwiseContourIntegral;": '\U00002233', + "Cross;": '\U00002A2F', + "Cscr;": '\U0001D49E', + "Cup;": '\U000022D3', + "CupCap;": '\U0000224D', + "DD;": '\U00002145', + "DDotrahd;": '\U00002911', + "DJcy;": '\U00000402', + "DScy;": '\U00000405', + "DZcy;": '\U0000040F', + "Dagger;": '\U00002021', + "Darr;": '\U000021A1', + "Dashv;": '\U00002AE4', + "Dcaron;": '\U0000010E', + "Dcy;": '\U00000414', + "Del;": '\U00002207', + "Delta;": '\U00000394', + "Dfr;": '\U0001D507', + "DiacriticalAcute;": '\U000000B4', + "DiacriticalDot;": '\U000002D9', + "DiacriticalDoubleAcute;": '\U000002DD', + "DiacriticalGrave;": '\U00000060', + "DiacriticalTilde;": '\U000002DC', + "Diamond;": '\U000022C4', + "DifferentialD;": '\U00002146', + "Dopf;": '\U0001D53B', + "Dot;": '\U000000A8', + "DotDot;": '\U000020DC', + "DotEqual;": '\U00002250', + "DoubleContourIntegral;": '\U0000222F', + "DoubleDot;": '\U000000A8', + "DoubleDownArrow;": '\U000021D3', + "DoubleLeftArrow;": '\U000021D0', + "DoubleLeftRightArrow;": '\U000021D4', + "DoubleLeftTee;": '\U00002AE4', + "DoubleLongLeftArrow;": '\U000027F8', + "DoubleLongLeftRightArrow;": '\U000027FA', + "DoubleLongRightArrow;": '\U000027F9', + "DoubleRightArrow;": '\U000021D2', + "DoubleRightTee;": '\U000022A8', + "DoubleUpArrow;": '\U000021D1', + "DoubleUpDownArrow;": '\U000021D5', + "DoubleVerticalBar;": '\U00002225', + "DownArrow;": '\U00002193', + "DownArrowBar;": '\U00002913', + "DownArrowUpArrow;": '\U000021F5', + "DownBreve;": '\U00000311', + "DownLeftRightVector;": '\U00002950', + "DownLeftTeeVector;": '\U0000295E', + "DownLeftVector;": '\U000021BD', + "DownLeftVectorBar;": '\U00002956', + "DownRightTeeVector;": '\U0000295F', + "DownRightVector;": '\U000021C1', + "DownRightVectorBar;": '\U00002957', + "DownTee;": '\U000022A4', + "DownTeeArrow;": '\U000021A7', + "Downarrow;": '\U000021D3', + "Dscr;": '\U0001D49F', + "Dstrok;": '\U00000110', + "ENG;": '\U0000014A', + "ETH;": '\U000000D0', + "Eacute;": '\U000000C9', + "Ecaron;": '\U0000011A', + "Ecirc;": '\U000000CA', + "Ecy;": '\U0000042D', + "Edot;": '\U00000116', + "Efr;": '\U0001D508', + "Egrave;": '\U000000C8', + "Element;": '\U00002208', + "Emacr;": '\U00000112', + "EmptySmallSquare;": '\U000025FB', + "EmptyVerySmallSquare;": '\U000025AB', + "Eogon;": '\U00000118', + "Eopf;": '\U0001D53C', + "Epsilon;": '\U00000395', + "Equal;": '\U00002A75', + "EqualTilde;": '\U00002242', + "Equilibrium;": '\U000021CC', + "Escr;": '\U00002130', + "Esim;": '\U00002A73', + "Eta;": '\U00000397', + "Euml;": '\U000000CB', + "Exists;": '\U00002203', + "ExponentialE;": '\U00002147', + "Fcy;": '\U00000424', + "Ffr;": '\U0001D509', + "FilledSmallSquare;": '\U000025FC', + "FilledVerySmallSquare;": '\U000025AA', + "Fopf;": '\U0001D53D', + "ForAll;": '\U00002200', + "Fouriertrf;": '\U00002131', + "Fscr;": '\U00002131', + "GJcy;": '\U00000403', + "GT;": '\U0000003E', + "Gamma;": '\U00000393', + "Gammad;": '\U000003DC', + "Gbreve;": '\U0000011E', + "Gcedil;": '\U00000122', + "Gcirc;": '\U0000011C', + "Gcy;": '\U00000413', + "Gdot;": '\U00000120', + "Gfr;": '\U0001D50A', + "Gg;": '\U000022D9', + "Gopf;": '\U0001D53E', + "GreaterEqual;": '\U00002265', + "GreaterEqualLess;": '\U000022DB', + "GreaterFullEqual;": '\U00002267', + "GreaterGreater;": '\U00002AA2', + "GreaterLess;": '\U00002277', + "GreaterSlantEqual;": '\U00002A7E', + "GreaterTilde;": '\U00002273', + "Gscr;": '\U0001D4A2', + "Gt;": '\U0000226B', + "HARDcy;": '\U0000042A', + "Hacek;": '\U000002C7', + "Hat;": '\U0000005E', + "Hcirc;": '\U00000124', + "Hfr;": '\U0000210C', + "HilbertSpace;": '\U0000210B', + "Hopf;": '\U0000210D', + "HorizontalLine;": '\U00002500', + "Hscr;": '\U0000210B', + "Hstrok;": '\U00000126', + "HumpDownHump;": '\U0000224E', + "HumpEqual;": '\U0000224F', + "IEcy;": '\U00000415', + "IJlig;": '\U00000132', + "IOcy;": '\U00000401', + "Iacute;": '\U000000CD', + "Icirc;": '\U000000CE', + "Icy;": '\U00000418', + "Idot;": '\U00000130', + "Ifr;": '\U00002111', + "Igrave;": '\U000000CC', + "Im;": '\U00002111', + "Imacr;": '\U0000012A', + "ImaginaryI;": '\U00002148', + "Implies;": '\U000021D2', + "Int;": '\U0000222C', + "Integral;": '\U0000222B', + "Intersection;": '\U000022C2', + "InvisibleComma;": '\U00002063', + "InvisibleTimes;": '\U00002062', + "Iogon;": '\U0000012E', + "Iopf;": '\U0001D540', + "Iota;": '\U00000399', + "Iscr;": '\U00002110', + "Itilde;": '\U00000128', + "Iukcy;": '\U00000406', + "Iuml;": '\U000000CF', + "Jcirc;": '\U00000134', + "Jcy;": '\U00000419', + "Jfr;": '\U0001D50D', + "Jopf;": '\U0001D541', + "Jscr;": '\U0001D4A5', + "Jsercy;": '\U00000408', + "Jukcy;": '\U00000404', + "KHcy;": '\U00000425', + "KJcy;": '\U0000040C', + "Kappa;": '\U0000039A', + "Kcedil;": '\U00000136', + "Kcy;": '\U0000041A', + "Kfr;": '\U0001D50E', + "Kopf;": '\U0001D542', + "Kscr;": '\U0001D4A6', + "LJcy;": '\U00000409', + "LT;": '\U0000003C', + "Lacute;": '\U00000139', + "Lambda;": '\U0000039B', + "Lang;": '\U000027EA', + "Laplacetrf;": '\U00002112', + "Larr;": '\U0000219E', + "Lcaron;": '\U0000013D', + "Lcedil;": '\U0000013B', + "Lcy;": '\U0000041B', + "LeftAngleBracket;": '\U000027E8', + "LeftArrow;": '\U00002190', + "LeftArrowBar;": '\U000021E4', + "LeftArrowRightArrow;": '\U000021C6', + "LeftCeiling;": '\U00002308', + "LeftDoubleBracket;": '\U000027E6', + "LeftDownTeeVector;": '\U00002961', + "LeftDownVector;": '\U000021C3', + "LeftDownVectorBar;": '\U00002959', + "LeftFloor;": '\U0000230A', + "LeftRightArrow;": '\U00002194', + "LeftRightVector;": '\U0000294E', + "LeftTee;": '\U000022A3', + "LeftTeeArrow;": '\U000021A4', + "LeftTeeVector;": '\U0000295A', + "LeftTriangle;": '\U000022B2', + "LeftTriangleBar;": '\U000029CF', + "LeftTriangleEqual;": '\U000022B4', + "LeftUpDownVector;": '\U00002951', + "LeftUpTeeVector;": '\U00002960', + "LeftUpVector;": '\U000021BF', + "LeftUpVectorBar;": '\U00002958', + "LeftVector;": '\U000021BC', + "LeftVectorBar;": '\U00002952', + "Leftarrow;": '\U000021D0', + "Leftrightarrow;": '\U000021D4', + "LessEqualGreater;": '\U000022DA', + "LessFullEqual;": '\U00002266', + "LessGreater;": '\U00002276', + "LessLess;": '\U00002AA1', + "LessSlantEqual;": '\U00002A7D', + "LessTilde;": '\U00002272', + "Lfr;": '\U0001D50F', + "Ll;": '\U000022D8', + "Lleftarrow;": '\U000021DA', + "Lmidot;": '\U0000013F', + "LongLeftArrow;": '\U000027F5', + "LongLeftRightArrow;": '\U000027F7', + "LongRightArrow;": '\U000027F6', + "Longleftarrow;": '\U000027F8', + "Longleftrightarrow;": '\U000027FA', + "Longrightarrow;": '\U000027F9', + "Lopf;": '\U0001D543', + "LowerLeftArrow;": '\U00002199', + "LowerRightArrow;": '\U00002198', + "Lscr;": '\U00002112', + "Lsh;": '\U000021B0', + "Lstrok;": '\U00000141', + "Lt;": '\U0000226A', + "Map;": '\U00002905', + "Mcy;": '\U0000041C', + "MediumSpace;": '\U0000205F', + "Mellintrf;": '\U00002133', + "Mfr;": '\U0001D510', + "MinusPlus;": '\U00002213', + "Mopf;": '\U0001D544', + "Mscr;": '\U00002133', + "Mu;": '\U0000039C', + "NJcy;": '\U0000040A', + "Nacute;": '\U00000143', + "Ncaron;": '\U00000147', + "Ncedil;": '\U00000145', + "Ncy;": '\U0000041D', + "NegativeMediumSpace;": '\U0000200B', + "NegativeThickSpace;": '\U0000200B', + "NegativeThinSpace;": '\U0000200B', + "NegativeVeryThinSpace;": '\U0000200B', + "NestedGreaterGreater;": '\U0000226B', + "NestedLessLess;": '\U0000226A', + "NewLine;": '\U0000000A', + "Nfr;": '\U0001D511', + "NoBreak;": '\U00002060', + "NonBreakingSpace;": '\U000000A0', + "Nopf;": '\U00002115', + "Not;": '\U00002AEC', + "NotCongruent;": '\U00002262', + "NotCupCap;": '\U0000226D', + "NotDoubleVerticalBar;": '\U00002226', + "NotElement;": '\U00002209', + "NotEqual;": '\U00002260', + "NotExists;": '\U00002204', + "NotGreater;": '\U0000226F', + "NotGreaterEqual;": '\U00002271', + "NotGreaterLess;": '\U00002279', + "NotGreaterTilde;": '\U00002275', + "NotLeftTriangle;": '\U000022EA', + "NotLeftTriangleEqual;": '\U000022EC', + "NotLess;": '\U0000226E', + "NotLessEqual;": '\U00002270', + "NotLessGreater;": '\U00002278', + "NotLessTilde;": '\U00002274', + "NotPrecedes;": '\U00002280', + "NotPrecedesSlantEqual;": '\U000022E0', + "NotReverseElement;": '\U0000220C', + "NotRightTriangle;": '\U000022EB', + "NotRightTriangleEqual;": '\U000022ED', + "NotSquareSubsetEqual;": '\U000022E2', + "NotSquareSupersetEqual;": '\U000022E3', + "NotSubsetEqual;": '\U00002288', + "NotSucceeds;": '\U00002281', + "NotSucceedsSlantEqual;": '\U000022E1', + "NotSupersetEqual;": '\U00002289', + "NotTilde;": '\U00002241', + "NotTildeEqual;": '\U00002244', + "NotTildeFullEqual;": '\U00002247', + "NotTildeTilde;": '\U00002249', + "NotVerticalBar;": '\U00002224', + "Nscr;": '\U0001D4A9', + "Ntilde;": '\U000000D1', + "Nu;": '\U0000039D', + "OElig;": '\U00000152', + "Oacute;": '\U000000D3', + "Ocirc;": '\U000000D4', + "Ocy;": '\U0000041E', + "Odblac;": '\U00000150', + "Ofr;": '\U0001D512', + "Ograve;": '\U000000D2', + "Omacr;": '\U0000014C', + "Omega;": '\U000003A9', + "Omicron;": '\U0000039F', + "Oopf;": '\U0001D546', + "OpenCurlyDoubleQuote;": '\U0000201C', + "OpenCurlyQuote;": '\U00002018', + "Or;": '\U00002A54', + "Oscr;": '\U0001D4AA', + "Oslash;": '\U000000D8', + "Otilde;": '\U000000D5', + "Otimes;": '\U00002A37', + "Ouml;": '\U000000D6', + "OverBar;": '\U0000203E', + "OverBrace;": '\U000023DE', + "OverBracket;": '\U000023B4', + "OverParenthesis;": '\U000023DC', + "PartialD;": '\U00002202', + "Pcy;": '\U0000041F', + "Pfr;": '\U0001D513', + "Phi;": '\U000003A6', + "Pi;": '\U000003A0', + "PlusMinus;": '\U000000B1', + "Poincareplane;": '\U0000210C', + "Popf;": '\U00002119', + "Pr;": '\U00002ABB', + "Precedes;": '\U0000227A', + "PrecedesEqual;": '\U00002AAF', + "PrecedesSlantEqual;": '\U0000227C', + "PrecedesTilde;": '\U0000227E', + "Prime;": '\U00002033', + "Product;": '\U0000220F', + "Proportion;": '\U00002237', + "Proportional;": '\U0000221D', + "Pscr;": '\U0001D4AB', + "Psi;": '\U000003A8', + "QUOT;": '\U00000022', + "Qfr;": '\U0001D514', + "Qopf;": '\U0000211A', + "Qscr;": '\U0001D4AC', + "RBarr;": '\U00002910', + "REG;": '\U000000AE', + "Racute;": '\U00000154', + "Rang;": '\U000027EB', + "Rarr;": '\U000021A0', + "Rarrtl;": '\U00002916', + "Rcaron;": '\U00000158', + "Rcedil;": '\U00000156', + "Rcy;": '\U00000420', + "Re;": '\U0000211C', + "ReverseElement;": '\U0000220B', + "ReverseEquilibrium;": '\U000021CB', + "ReverseUpEquilibrium;": '\U0000296F', + "Rfr;": '\U0000211C', + "Rho;": '\U000003A1', + "RightAngleBracket;": '\U000027E9', + "RightArrow;": '\U00002192', + "RightArrowBar;": '\U000021E5', + "RightArrowLeftArrow;": '\U000021C4', + "RightCeiling;": '\U00002309', + "RightDoubleBracket;": '\U000027E7', + "RightDownTeeVector;": '\U0000295D', + "RightDownVector;": '\U000021C2', + "RightDownVectorBar;": '\U00002955', + "RightFloor;": '\U0000230B', + "RightTee;": '\U000022A2', + "RightTeeArrow;": '\U000021A6', + "RightTeeVector;": '\U0000295B', + "RightTriangle;": '\U000022B3', + "RightTriangleBar;": '\U000029D0', + "RightTriangleEqual;": '\U000022B5', + "RightUpDownVector;": '\U0000294F', + "RightUpTeeVector;": '\U0000295C', + "RightUpVector;": '\U000021BE', + "RightUpVectorBar;": '\U00002954', + "RightVector;": '\U000021C0', + "RightVectorBar;": '\U00002953', + "Rightarrow;": '\U000021D2', + "Ropf;": '\U0000211D', + "RoundImplies;": '\U00002970', + "Rrightarrow;": '\U000021DB', + "Rscr;": '\U0000211B', + "Rsh;": '\U000021B1', + "RuleDelayed;": '\U000029F4', + "SHCHcy;": '\U00000429', + "SHcy;": '\U00000428', + "SOFTcy;": '\U0000042C', + "Sacute;": '\U0000015A', + "Sc;": '\U00002ABC', + "Scaron;": '\U00000160', + "Scedil;": '\U0000015E', + "Scirc;": '\U0000015C', + "Scy;": '\U00000421', + "Sfr;": '\U0001D516', + "ShortDownArrow;": '\U00002193', + "ShortLeftArrow;": '\U00002190', + "ShortRightArrow;": '\U00002192', + "ShortUpArrow;": '\U00002191', + "Sigma;": '\U000003A3', + "SmallCircle;": '\U00002218', + "Sopf;": '\U0001D54A', + "Sqrt;": '\U0000221A', + "Square;": '\U000025A1', + "SquareIntersection;": '\U00002293', + "SquareSubset;": '\U0000228F', + "SquareSubsetEqual;": '\U00002291', + "SquareSuperset;": '\U00002290', + "SquareSupersetEqual;": '\U00002292', + "SquareUnion;": '\U00002294', + "Sscr;": '\U0001D4AE', + "Star;": '\U000022C6', + "Sub;": '\U000022D0', + "Subset;": '\U000022D0', + "SubsetEqual;": '\U00002286', + "Succeeds;": '\U0000227B', + "SucceedsEqual;": '\U00002AB0', + "SucceedsSlantEqual;": '\U0000227D', + "SucceedsTilde;": '\U0000227F', + "SuchThat;": '\U0000220B', + "Sum;": '\U00002211', + "Sup;": '\U000022D1', + "Superset;": '\U00002283', + "SupersetEqual;": '\U00002287', + "Supset;": '\U000022D1', + "THORN;": '\U000000DE', + "TRADE;": '\U00002122', + "TSHcy;": '\U0000040B', + "TScy;": '\U00000426', + "Tab;": '\U00000009', + "Tau;": '\U000003A4', + "Tcaron;": '\U00000164', + "Tcedil;": '\U00000162', + "Tcy;": '\U00000422', + "Tfr;": '\U0001D517', + "Therefore;": '\U00002234', + "Theta;": '\U00000398', + "ThinSpace;": '\U00002009', + "Tilde;": '\U0000223C', + "TildeEqual;": '\U00002243', + "TildeFullEqual;": '\U00002245', + "TildeTilde;": '\U00002248', + "Topf;": '\U0001D54B', + "TripleDot;": '\U000020DB', + "Tscr;": '\U0001D4AF', + "Tstrok;": '\U00000166', + "Uacute;": '\U000000DA', + "Uarr;": '\U0000219F', + "Uarrocir;": '\U00002949', + "Ubrcy;": '\U0000040E', + "Ubreve;": '\U0000016C', + "Ucirc;": '\U000000DB', + "Ucy;": '\U00000423', + "Udblac;": '\U00000170', + "Ufr;": '\U0001D518', + "Ugrave;": '\U000000D9', + "Umacr;": '\U0000016A', + "UnderBar;": '\U0000005F', + "UnderBrace;": '\U000023DF', + "UnderBracket;": '\U000023B5', + "UnderParenthesis;": '\U000023DD', + "Union;": '\U000022C3', + "UnionPlus;": '\U0000228E', + "Uogon;": '\U00000172', + "Uopf;": '\U0001D54C', + "UpArrow;": '\U00002191', + "UpArrowBar;": '\U00002912', + "UpArrowDownArrow;": '\U000021C5', + "UpDownArrow;": '\U00002195', + "UpEquilibrium;": '\U0000296E', + "UpTee;": '\U000022A5', + "UpTeeArrow;": '\U000021A5', + "Uparrow;": '\U000021D1', + "Updownarrow;": '\U000021D5', + "UpperLeftArrow;": '\U00002196', + "UpperRightArrow;": '\U00002197', + "Upsi;": '\U000003D2', + "Upsilon;": '\U000003A5', + "Uring;": '\U0000016E', + "Uscr;": '\U0001D4B0', + "Utilde;": '\U00000168', + "Uuml;": '\U000000DC', + "VDash;": '\U000022AB', + "Vbar;": '\U00002AEB', + "Vcy;": '\U00000412', + "Vdash;": '\U000022A9', + "Vdashl;": '\U00002AE6', + "Vee;": '\U000022C1', + "Verbar;": '\U00002016', + "Vert;": '\U00002016', + "VerticalBar;": '\U00002223', + "VerticalLine;": '\U0000007C', + "VerticalSeparator;": '\U00002758', + "VerticalTilde;": '\U00002240', + "VeryThinSpace;": '\U0000200A', + "Vfr;": '\U0001D519', + "Vopf;": '\U0001D54D', + "Vscr;": '\U0001D4B1', + "Vvdash;": '\U000022AA', + "Wcirc;": '\U00000174', + "Wedge;": '\U000022C0', + "Wfr;": '\U0001D51A', + "Wopf;": '\U0001D54E', + "Wscr;": '\U0001D4B2', + "Xfr;": '\U0001D51B', + "Xi;": '\U0000039E', + "Xopf;": '\U0001D54F', + "Xscr;": '\U0001D4B3', + "YAcy;": '\U0000042F', + "YIcy;": '\U00000407', + "YUcy;": '\U0000042E', + "Yacute;": '\U000000DD', + "Ycirc;": '\U00000176', + "Ycy;": '\U0000042B', + "Yfr;": '\U0001D51C', + "Yopf;": '\U0001D550', + "Yscr;": '\U0001D4B4', + "Yuml;": '\U00000178', + "ZHcy;": '\U00000416', + "Zacute;": '\U00000179', + "Zcaron;": '\U0000017D', + "Zcy;": '\U00000417', + "Zdot;": '\U0000017B', + "ZeroWidthSpace;": '\U0000200B', + "Zeta;": '\U00000396', + "Zfr;": '\U00002128', + "Zopf;": '\U00002124', + "Zscr;": '\U0001D4B5', + "aacute;": '\U000000E1', + "abreve;": '\U00000103', + "ac;": '\U0000223E', + "acd;": '\U0000223F', + "acirc;": '\U000000E2', + "acute;": '\U000000B4', + "acy;": '\U00000430', + "aelig;": '\U000000E6', + "af;": '\U00002061', + "afr;": '\U0001D51E', + "agrave;": '\U000000E0', + "alefsym;": '\U00002135', + "aleph;": '\U00002135', + "alpha;": '\U000003B1', + "amacr;": '\U00000101', + "amalg;": '\U00002A3F', + "amp;": '\U00000026', + "and;": '\U00002227', + "andand;": '\U00002A55', + "andd;": '\U00002A5C', + "andslope;": '\U00002A58', + "andv;": '\U00002A5A', + "ang;": '\U00002220', + "ange;": '\U000029A4', + "angle;": '\U00002220', + "angmsd;": '\U00002221', + "angmsdaa;": '\U000029A8', + "angmsdab;": '\U000029A9', + "angmsdac;": '\U000029AA', + "angmsdad;": '\U000029AB', + "angmsdae;": '\U000029AC', + "angmsdaf;": '\U000029AD', + "angmsdag;": '\U000029AE', + "angmsdah;": '\U000029AF', + "angrt;": '\U0000221F', + "angrtvb;": '\U000022BE', + "angrtvbd;": '\U0000299D', + "angsph;": '\U00002222', + "angst;": '\U000000C5', + "angzarr;": '\U0000237C', + "aogon;": '\U00000105', + "aopf;": '\U0001D552', + "ap;": '\U00002248', + "apE;": '\U00002A70', + "apacir;": '\U00002A6F', + "ape;": '\U0000224A', + "apid;": '\U0000224B', + "apos;": '\U00000027', + "approx;": '\U00002248', + "approxeq;": '\U0000224A', + "aring;": '\U000000E5', + "ascr;": '\U0001D4B6', + "ast;": '\U0000002A', + "asymp;": '\U00002248', + "asympeq;": '\U0000224D', + "atilde;": '\U000000E3', + "auml;": '\U000000E4', + "awconint;": '\U00002233', + "awint;": '\U00002A11', + "bNot;": '\U00002AED', + "backcong;": '\U0000224C', + "backepsilon;": '\U000003F6', + "backprime;": '\U00002035', + "backsim;": '\U0000223D', + "backsimeq;": '\U000022CD', + "barvee;": '\U000022BD', + "barwed;": '\U00002305', + "barwedge;": '\U00002305', + "bbrk;": '\U000023B5', + "bbrktbrk;": '\U000023B6', + "bcong;": '\U0000224C', + "bcy;": '\U00000431', + "bdquo;": '\U0000201E', + "becaus;": '\U00002235', + "because;": '\U00002235', + "bemptyv;": '\U000029B0', + "bepsi;": '\U000003F6', + "bernou;": '\U0000212C', + "beta;": '\U000003B2', + "beth;": '\U00002136', + "between;": '\U0000226C', + "bfr;": '\U0001D51F', + "bigcap;": '\U000022C2', + "bigcirc;": '\U000025EF', + "bigcup;": '\U000022C3', + "bigodot;": '\U00002A00', + "bigoplus;": '\U00002A01', + "bigotimes;": '\U00002A02', + "bigsqcup;": '\U00002A06', + "bigstar;": '\U00002605', + "bigtriangledown;": '\U000025BD', + "bigtriangleup;": '\U000025B3', + "biguplus;": '\U00002A04', + "bigvee;": '\U000022C1', + "bigwedge;": '\U000022C0', + "bkarow;": '\U0000290D', + "blacklozenge;": '\U000029EB', + "blacksquare;": '\U000025AA', + "blacktriangle;": '\U000025B4', + "blacktriangledown;": '\U000025BE', + "blacktriangleleft;": '\U000025C2', + "blacktriangleright;": '\U000025B8', + "blank;": '\U00002423', + "blk12;": '\U00002592', + "blk14;": '\U00002591', + "blk34;": '\U00002593', + "block;": '\U00002588', + "bnot;": '\U00002310', + "bopf;": '\U0001D553', + "bot;": '\U000022A5', + "bottom;": '\U000022A5', + "bowtie;": '\U000022C8', + "boxDL;": '\U00002557', + "boxDR;": '\U00002554', + "boxDl;": '\U00002556', + "boxDr;": '\U00002553', + "boxH;": '\U00002550', + "boxHD;": '\U00002566', + "boxHU;": '\U00002569', + "boxHd;": '\U00002564', + "boxHu;": '\U00002567', + "boxUL;": '\U0000255D', + "boxUR;": '\U0000255A', + "boxUl;": '\U0000255C', + "boxUr;": '\U00002559', + "boxV;": '\U00002551', + "boxVH;": '\U0000256C', + "boxVL;": '\U00002563', + "boxVR;": '\U00002560', + "boxVh;": '\U0000256B', + "boxVl;": '\U00002562', + "boxVr;": '\U0000255F', + "boxbox;": '\U000029C9', + "boxdL;": '\U00002555', + "boxdR;": '\U00002552', + "boxdl;": '\U00002510', + "boxdr;": '\U0000250C', + "boxh;": '\U00002500', + "boxhD;": '\U00002565', + "boxhU;": '\U00002568', + "boxhd;": '\U0000252C', + "boxhu;": '\U00002534', + "boxminus;": '\U0000229F', + "boxplus;": '\U0000229E', + "boxtimes;": '\U000022A0', + "boxuL;": '\U0000255B', + "boxuR;": '\U00002558', + "boxul;": '\U00002518', + "boxur;": '\U00002514', + "boxv;": '\U00002502', + "boxvH;": '\U0000256A', + "boxvL;": '\U00002561', + "boxvR;": '\U0000255E', + "boxvh;": '\U0000253C', + "boxvl;": '\U00002524', + "boxvr;": '\U0000251C', + "bprime;": '\U00002035', + "breve;": '\U000002D8', + "brvbar;": '\U000000A6', + "bscr;": '\U0001D4B7', + "bsemi;": '\U0000204F', + "bsim;": '\U0000223D', + "bsime;": '\U000022CD', + "bsol;": '\U0000005C', + "bsolb;": '\U000029C5', + "bsolhsub;": '\U000027C8', + "bull;": '\U00002022', + "bullet;": '\U00002022', + "bump;": '\U0000224E', + "bumpE;": '\U00002AAE', + "bumpe;": '\U0000224F', + "bumpeq;": '\U0000224F', + "cacute;": '\U00000107', + "cap;": '\U00002229', + "capand;": '\U00002A44', + "capbrcup;": '\U00002A49', + "capcap;": '\U00002A4B', + "capcup;": '\U00002A47', + "capdot;": '\U00002A40', + "caret;": '\U00002041', + "caron;": '\U000002C7', + "ccaps;": '\U00002A4D', + "ccaron;": '\U0000010D', + "ccedil;": '\U000000E7', + "ccirc;": '\U00000109', + "ccups;": '\U00002A4C', + "ccupssm;": '\U00002A50', + "cdot;": '\U0000010B', + "cedil;": '\U000000B8', + "cemptyv;": '\U000029B2', + "cent;": '\U000000A2', + "centerdot;": '\U000000B7', + "cfr;": '\U0001D520', + "chcy;": '\U00000447', + "check;": '\U00002713', + "checkmark;": '\U00002713', + "chi;": '\U000003C7', + "cir;": '\U000025CB', + "cirE;": '\U000029C3', + "circ;": '\U000002C6', + "circeq;": '\U00002257', + "circlearrowleft;": '\U000021BA', + "circlearrowright;": '\U000021BB', + "circledR;": '\U000000AE', + "circledS;": '\U000024C8', + "circledast;": '\U0000229B', + "circledcirc;": '\U0000229A', + "circleddash;": '\U0000229D', + "cire;": '\U00002257', + "cirfnint;": '\U00002A10', + "cirmid;": '\U00002AEF', + "cirscir;": '\U000029C2', + "clubs;": '\U00002663', + "clubsuit;": '\U00002663', + "colon;": '\U0000003A', + "colone;": '\U00002254', + "coloneq;": '\U00002254', + "comma;": '\U0000002C', + "commat;": '\U00000040', + "comp;": '\U00002201', + "compfn;": '\U00002218', + "complement;": '\U00002201', + "complexes;": '\U00002102', + "cong;": '\U00002245', + "congdot;": '\U00002A6D', + "conint;": '\U0000222E', + "copf;": '\U0001D554', + "coprod;": '\U00002210', + "copy;": '\U000000A9', + "copysr;": '\U00002117', + "crarr;": '\U000021B5', + "cross;": '\U00002717', + "cscr;": '\U0001D4B8', + "csub;": '\U00002ACF', + "csube;": '\U00002AD1', + "csup;": '\U00002AD0', + "csupe;": '\U00002AD2', + "ctdot;": '\U000022EF', + "cudarrl;": '\U00002938', + "cudarrr;": '\U00002935', + "cuepr;": '\U000022DE', + "cuesc;": '\U000022DF', + "cularr;": '\U000021B6', + "cularrp;": '\U0000293D', + "cup;": '\U0000222A', + "cupbrcap;": '\U00002A48', + "cupcap;": '\U00002A46', + "cupcup;": '\U00002A4A', + "cupdot;": '\U0000228D', + "cupor;": '\U00002A45', + "curarr;": '\U000021B7', + "curarrm;": '\U0000293C', + "curlyeqprec;": '\U000022DE', + "curlyeqsucc;": '\U000022DF', + "curlyvee;": '\U000022CE', + "curlywedge;": '\U000022CF', + "curren;": '\U000000A4', + "curvearrowleft;": '\U000021B6', + "curvearrowright;": '\U000021B7', + "cuvee;": '\U000022CE', + "cuwed;": '\U000022CF', + "cwconint;": '\U00002232', + "cwint;": '\U00002231', + "cylcty;": '\U0000232D', + "dArr;": '\U000021D3', + "dHar;": '\U00002965', + "dagger;": '\U00002020', + "daleth;": '\U00002138', + "darr;": '\U00002193', + "dash;": '\U00002010', + "dashv;": '\U000022A3', + "dbkarow;": '\U0000290F', + "dblac;": '\U000002DD', + "dcaron;": '\U0000010F', + "dcy;": '\U00000434', + "dd;": '\U00002146', + "ddagger;": '\U00002021', + "ddarr;": '\U000021CA', + "ddotseq;": '\U00002A77', + "deg;": '\U000000B0', + "delta;": '\U000003B4', + "demptyv;": '\U000029B1', + "dfisht;": '\U0000297F', + "dfr;": '\U0001D521', + "dharl;": '\U000021C3', + "dharr;": '\U000021C2', + "diam;": '\U000022C4', + "diamond;": '\U000022C4', + "diamondsuit;": '\U00002666', + "diams;": '\U00002666', + "die;": '\U000000A8', + "digamma;": '\U000003DD', + "disin;": '\U000022F2', + "div;": '\U000000F7', + "divide;": '\U000000F7', + "divideontimes;": '\U000022C7', + "divonx;": '\U000022C7', + "djcy;": '\U00000452', + "dlcorn;": '\U0000231E', + "dlcrop;": '\U0000230D', + "dollar;": '\U00000024', + "dopf;": '\U0001D555', + "dot;": '\U000002D9', + "doteq;": '\U00002250', + "doteqdot;": '\U00002251', + "dotminus;": '\U00002238', + "dotplus;": '\U00002214', + "dotsquare;": '\U000022A1', + "doublebarwedge;": '\U00002306', + "downarrow;": '\U00002193', + "downdownarrows;": '\U000021CA', + "downharpoonleft;": '\U000021C3', + "downharpoonright;": '\U000021C2', + "drbkarow;": '\U00002910', + "drcorn;": '\U0000231F', + "drcrop;": '\U0000230C', + "dscr;": '\U0001D4B9', + "dscy;": '\U00000455', + "dsol;": '\U000029F6', + "dstrok;": '\U00000111', + "dtdot;": '\U000022F1', + "dtri;": '\U000025BF', + "dtrif;": '\U000025BE', + "duarr;": '\U000021F5', + "duhar;": '\U0000296F', + "dwangle;": '\U000029A6', + "dzcy;": '\U0000045F', + "dzigrarr;": '\U000027FF', + "eDDot;": '\U00002A77', + "eDot;": '\U00002251', + "eacute;": '\U000000E9', + "easter;": '\U00002A6E', + "ecaron;": '\U0000011B', + "ecir;": '\U00002256', + "ecirc;": '\U000000EA', + "ecolon;": '\U00002255', + "ecy;": '\U0000044D', + "edot;": '\U00000117', + "ee;": '\U00002147', + "efDot;": '\U00002252', + "efr;": '\U0001D522', + "eg;": '\U00002A9A', + "egrave;": '\U000000E8', + "egs;": '\U00002A96', + "egsdot;": '\U00002A98', + "el;": '\U00002A99', + "elinters;": '\U000023E7', + "ell;": '\U00002113', + "els;": '\U00002A95', + "elsdot;": '\U00002A97', + "emacr;": '\U00000113', + "empty;": '\U00002205', + "emptyset;": '\U00002205', + "emptyv;": '\U00002205', + "emsp;": '\U00002003', + "emsp13;": '\U00002004', + "emsp14;": '\U00002005', + "eng;": '\U0000014B', + "ensp;": '\U00002002', + "eogon;": '\U00000119', + "eopf;": '\U0001D556', + "epar;": '\U000022D5', + "eparsl;": '\U000029E3', + "eplus;": '\U00002A71', + "epsi;": '\U000003B5', + "epsilon;": '\U000003B5', + "epsiv;": '\U000003F5', + "eqcirc;": '\U00002256', + "eqcolon;": '\U00002255', + "eqsim;": '\U00002242', + "eqslantgtr;": '\U00002A96', + "eqslantless;": '\U00002A95', + "equals;": '\U0000003D', + "equest;": '\U0000225F', + "equiv;": '\U00002261', + "equivDD;": '\U00002A78', + "eqvparsl;": '\U000029E5', + "erDot;": '\U00002253', + "erarr;": '\U00002971', + "escr;": '\U0000212F', + "esdot;": '\U00002250', + "esim;": '\U00002242', + "eta;": '\U000003B7', + "eth;": '\U000000F0', + "euml;": '\U000000EB', + "euro;": '\U000020AC', + "excl;": '\U00000021', + "exist;": '\U00002203', + "expectation;": '\U00002130', + "exponentiale;": '\U00002147', + "fallingdotseq;": '\U00002252', + "fcy;": '\U00000444', + "female;": '\U00002640', + "ffilig;": '\U0000FB03', + "fflig;": '\U0000FB00', + "ffllig;": '\U0000FB04', + "ffr;": '\U0001D523', + "filig;": '\U0000FB01', + "flat;": '\U0000266D', + "fllig;": '\U0000FB02', + "fltns;": '\U000025B1', + "fnof;": '\U00000192', + "fopf;": '\U0001D557', + "forall;": '\U00002200', + "fork;": '\U000022D4', + "forkv;": '\U00002AD9', + "fpartint;": '\U00002A0D', + "frac12;": '\U000000BD', + "frac13;": '\U00002153', + "frac14;": '\U000000BC', + "frac15;": '\U00002155', + "frac16;": '\U00002159', + "frac18;": '\U0000215B', + "frac23;": '\U00002154', + "frac25;": '\U00002156', + "frac34;": '\U000000BE', + "frac35;": '\U00002157', + "frac38;": '\U0000215C', + "frac45;": '\U00002158', + "frac56;": '\U0000215A', + "frac58;": '\U0000215D', + "frac78;": '\U0000215E', + "frasl;": '\U00002044', + "frown;": '\U00002322', + "fscr;": '\U0001D4BB', + "gE;": '\U00002267', + "gEl;": '\U00002A8C', + "gacute;": '\U000001F5', + "gamma;": '\U000003B3', + "gammad;": '\U000003DD', + "gap;": '\U00002A86', + "gbreve;": '\U0000011F', + "gcirc;": '\U0000011D', + "gcy;": '\U00000433', + "gdot;": '\U00000121', + "ge;": '\U00002265', + "gel;": '\U000022DB', + "geq;": '\U00002265', + "geqq;": '\U00002267', + "geqslant;": '\U00002A7E', + "ges;": '\U00002A7E', + "gescc;": '\U00002AA9', + "gesdot;": '\U00002A80', + "gesdoto;": '\U00002A82', + "gesdotol;": '\U00002A84', + "gesles;": '\U00002A94', + "gfr;": '\U0001D524', + "gg;": '\U0000226B', + "ggg;": '\U000022D9', + "gimel;": '\U00002137', + "gjcy;": '\U00000453', + "gl;": '\U00002277', + "glE;": '\U00002A92', + "gla;": '\U00002AA5', + "glj;": '\U00002AA4', + "gnE;": '\U00002269', + "gnap;": '\U00002A8A', + "gnapprox;": '\U00002A8A', + "gne;": '\U00002A88', + "gneq;": '\U00002A88', + "gneqq;": '\U00002269', + "gnsim;": '\U000022E7', + "gopf;": '\U0001D558', + "grave;": '\U00000060', + "gscr;": '\U0000210A', + "gsim;": '\U00002273', + "gsime;": '\U00002A8E', + "gsiml;": '\U00002A90', + "gt;": '\U0000003E', + "gtcc;": '\U00002AA7', + "gtcir;": '\U00002A7A', + "gtdot;": '\U000022D7', + "gtlPar;": '\U00002995', + "gtquest;": '\U00002A7C', + "gtrapprox;": '\U00002A86', + "gtrarr;": '\U00002978', + "gtrdot;": '\U000022D7', + "gtreqless;": '\U000022DB', + "gtreqqless;": '\U00002A8C', + "gtrless;": '\U00002277', + "gtrsim;": '\U00002273', + "hArr;": '\U000021D4', + "hairsp;": '\U0000200A', + "half;": '\U000000BD', + "hamilt;": '\U0000210B', + "hardcy;": '\U0000044A', + "harr;": '\U00002194', + "harrcir;": '\U00002948', + "harrw;": '\U000021AD', + "hbar;": '\U0000210F', + "hcirc;": '\U00000125', + "hearts;": '\U00002665', + "heartsuit;": '\U00002665', + "hellip;": '\U00002026', + "hercon;": '\U000022B9', + "hfr;": '\U0001D525', + "hksearow;": '\U00002925', + "hkswarow;": '\U00002926', + "hoarr;": '\U000021FF', + "homtht;": '\U0000223B', + "hookleftarrow;": '\U000021A9', + "hookrightarrow;": '\U000021AA', + "hopf;": '\U0001D559', + "horbar;": '\U00002015', + "hscr;": '\U0001D4BD', + "hslash;": '\U0000210F', + "hstrok;": '\U00000127', + "hybull;": '\U00002043', + "hyphen;": '\U00002010', + "iacute;": '\U000000ED', + "ic;": '\U00002063', + "icirc;": '\U000000EE', + "icy;": '\U00000438', + "iecy;": '\U00000435', + "iexcl;": '\U000000A1', + "iff;": '\U000021D4', + "ifr;": '\U0001D526', + "igrave;": '\U000000EC', + "ii;": '\U00002148', + "iiiint;": '\U00002A0C', + "iiint;": '\U0000222D', + "iinfin;": '\U000029DC', + "iiota;": '\U00002129', + "ijlig;": '\U00000133', + "imacr;": '\U0000012B', + "image;": '\U00002111', + "imagline;": '\U00002110', + "imagpart;": '\U00002111', + "imath;": '\U00000131', + "imof;": '\U000022B7', + "imped;": '\U000001B5', + "in;": '\U00002208', + "incare;": '\U00002105', + "infin;": '\U0000221E', + "infintie;": '\U000029DD', + "inodot;": '\U00000131', + "int;": '\U0000222B', + "intcal;": '\U000022BA', + "integers;": '\U00002124', + "intercal;": '\U000022BA', + "intlarhk;": '\U00002A17', + "intprod;": '\U00002A3C', + "iocy;": '\U00000451', + "iogon;": '\U0000012F', + "iopf;": '\U0001D55A', + "iota;": '\U000003B9', + "iprod;": '\U00002A3C', + "iquest;": '\U000000BF', + "iscr;": '\U0001D4BE', + "isin;": '\U00002208', + "isinE;": '\U000022F9', + "isindot;": '\U000022F5', + "isins;": '\U000022F4', + "isinsv;": '\U000022F3', + "isinv;": '\U00002208', + "it;": '\U00002062', + "itilde;": '\U00000129', + "iukcy;": '\U00000456', + "iuml;": '\U000000EF', + "jcirc;": '\U00000135', + "jcy;": '\U00000439', + "jfr;": '\U0001D527', + "jmath;": '\U00000237', + "jopf;": '\U0001D55B', + "jscr;": '\U0001D4BF', + "jsercy;": '\U00000458', + "jukcy;": '\U00000454', + "kappa;": '\U000003BA', + "kappav;": '\U000003F0', + "kcedil;": '\U00000137', + "kcy;": '\U0000043A', + "kfr;": '\U0001D528', + "kgreen;": '\U00000138', + "khcy;": '\U00000445', + "kjcy;": '\U0000045C', + "kopf;": '\U0001D55C', + "kscr;": '\U0001D4C0', + "lAarr;": '\U000021DA', + "lArr;": '\U000021D0', + "lAtail;": '\U0000291B', + "lBarr;": '\U0000290E', + "lE;": '\U00002266', + "lEg;": '\U00002A8B', + "lHar;": '\U00002962', + "lacute;": '\U0000013A', + "laemptyv;": '\U000029B4', + "lagran;": '\U00002112', + "lambda;": '\U000003BB', + "lang;": '\U000027E8', + "langd;": '\U00002991', + "langle;": '\U000027E8', + "lap;": '\U00002A85', + "laquo;": '\U000000AB', + "larr;": '\U00002190', + "larrb;": '\U000021E4', + "larrbfs;": '\U0000291F', + "larrfs;": '\U0000291D', + "larrhk;": '\U000021A9', + "larrlp;": '\U000021AB', + "larrpl;": '\U00002939', + "larrsim;": '\U00002973', + "larrtl;": '\U000021A2', + "lat;": '\U00002AAB', + "latail;": '\U00002919', + "late;": '\U00002AAD', + "lbarr;": '\U0000290C', + "lbbrk;": '\U00002772', + "lbrace;": '\U0000007B', + "lbrack;": '\U0000005B', + "lbrke;": '\U0000298B', + "lbrksld;": '\U0000298F', + "lbrkslu;": '\U0000298D', + "lcaron;": '\U0000013E', + "lcedil;": '\U0000013C', + "lceil;": '\U00002308', + "lcub;": '\U0000007B', + "lcy;": '\U0000043B', + "ldca;": '\U00002936', + "ldquo;": '\U0000201C', + "ldquor;": '\U0000201E', + "ldrdhar;": '\U00002967', + "ldrushar;": '\U0000294B', + "ldsh;": '\U000021B2', + "le;": '\U00002264', + "leftarrow;": '\U00002190', + "leftarrowtail;": '\U000021A2', + "leftharpoondown;": '\U000021BD', + "leftharpoonup;": '\U000021BC', + "leftleftarrows;": '\U000021C7', + "leftrightarrow;": '\U00002194', + "leftrightarrows;": '\U000021C6', + "leftrightharpoons;": '\U000021CB', + "leftrightsquigarrow;": '\U000021AD', + "leftthreetimes;": '\U000022CB', + "leg;": '\U000022DA', + "leq;": '\U00002264', + "leqq;": '\U00002266', + "leqslant;": '\U00002A7D', + "les;": '\U00002A7D', + "lescc;": '\U00002AA8', + "lesdot;": '\U00002A7F', + "lesdoto;": '\U00002A81', + "lesdotor;": '\U00002A83', + "lesges;": '\U00002A93', + "lessapprox;": '\U00002A85', + "lessdot;": '\U000022D6', + "lesseqgtr;": '\U000022DA', + "lesseqqgtr;": '\U00002A8B', + "lessgtr;": '\U00002276', + "lesssim;": '\U00002272', + "lfisht;": '\U0000297C', + "lfloor;": '\U0000230A', + "lfr;": '\U0001D529', + "lg;": '\U00002276', + "lgE;": '\U00002A91', + "lhard;": '\U000021BD', + "lharu;": '\U000021BC', + "lharul;": '\U0000296A', + "lhblk;": '\U00002584', + "ljcy;": '\U00000459', + "ll;": '\U0000226A', + "llarr;": '\U000021C7', + "llcorner;": '\U0000231E', + "llhard;": '\U0000296B', + "lltri;": '\U000025FA', + "lmidot;": '\U00000140', + "lmoust;": '\U000023B0', + "lmoustache;": '\U000023B0', + "lnE;": '\U00002268', + "lnap;": '\U00002A89', + "lnapprox;": '\U00002A89', + "lne;": '\U00002A87', + "lneq;": '\U00002A87', + "lneqq;": '\U00002268', + "lnsim;": '\U000022E6', + "loang;": '\U000027EC', + "loarr;": '\U000021FD', + "lobrk;": '\U000027E6', + "longleftarrow;": '\U000027F5', + "longleftrightarrow;": '\U000027F7', + "longmapsto;": '\U000027FC', + "longrightarrow;": '\U000027F6', + "looparrowleft;": '\U000021AB', + "looparrowright;": '\U000021AC', + "lopar;": '\U00002985', + "lopf;": '\U0001D55D', + "loplus;": '\U00002A2D', + "lotimes;": '\U00002A34', + "lowast;": '\U00002217', + "lowbar;": '\U0000005F', + "loz;": '\U000025CA', + "lozenge;": '\U000025CA', + "lozf;": '\U000029EB', + "lpar;": '\U00000028', + "lparlt;": '\U00002993', + "lrarr;": '\U000021C6', + "lrcorner;": '\U0000231F', + "lrhar;": '\U000021CB', + "lrhard;": '\U0000296D', + "lrm;": '\U0000200E', + "lrtri;": '\U000022BF', + "lsaquo;": '\U00002039', + "lscr;": '\U0001D4C1', + "lsh;": '\U000021B0', + "lsim;": '\U00002272', + "lsime;": '\U00002A8D', + "lsimg;": '\U00002A8F', + "lsqb;": '\U0000005B', + "lsquo;": '\U00002018', + "lsquor;": '\U0000201A', + "lstrok;": '\U00000142', + "lt;": '\U0000003C', + "ltcc;": '\U00002AA6', + "ltcir;": '\U00002A79', + "ltdot;": '\U000022D6', + "lthree;": '\U000022CB', + "ltimes;": '\U000022C9', + "ltlarr;": '\U00002976', + "ltquest;": '\U00002A7B', + "ltrPar;": '\U00002996', + "ltri;": '\U000025C3', + "ltrie;": '\U000022B4', + "ltrif;": '\U000025C2', + "lurdshar;": '\U0000294A', + "luruhar;": '\U00002966', + "mDDot;": '\U0000223A', + "macr;": '\U000000AF', + "male;": '\U00002642', + "malt;": '\U00002720', + "maltese;": '\U00002720', + "map;": '\U000021A6', + "mapsto;": '\U000021A6', + "mapstodown;": '\U000021A7', + "mapstoleft;": '\U000021A4', + "mapstoup;": '\U000021A5', + "marker;": '\U000025AE', + "mcomma;": '\U00002A29', + "mcy;": '\U0000043C', + "mdash;": '\U00002014', + "measuredangle;": '\U00002221', + "mfr;": '\U0001D52A', + "mho;": '\U00002127', + "micro;": '\U000000B5', + "mid;": '\U00002223', + "midast;": '\U0000002A', + "midcir;": '\U00002AF0', + "middot;": '\U000000B7', + "minus;": '\U00002212', + "minusb;": '\U0000229F', + "minusd;": '\U00002238', + "minusdu;": '\U00002A2A', + "mlcp;": '\U00002ADB', + "mldr;": '\U00002026', + "mnplus;": '\U00002213', + "models;": '\U000022A7', + "mopf;": '\U0001D55E', + "mp;": '\U00002213', + "mscr;": '\U0001D4C2', + "mstpos;": '\U0000223E', + "mu;": '\U000003BC', + "multimap;": '\U000022B8', + "mumap;": '\U000022B8', + "nLeftarrow;": '\U000021CD', + "nLeftrightarrow;": '\U000021CE', + "nRightarrow;": '\U000021CF', + "nVDash;": '\U000022AF', + "nVdash;": '\U000022AE', + "nabla;": '\U00002207', + "nacute;": '\U00000144', + "nap;": '\U00002249', + "napos;": '\U00000149', + "napprox;": '\U00002249', + "natur;": '\U0000266E', + "natural;": '\U0000266E', + "naturals;": '\U00002115', + "nbsp;": '\U000000A0', + "ncap;": '\U00002A43', + "ncaron;": '\U00000148', + "ncedil;": '\U00000146', + "ncong;": '\U00002247', + "ncup;": '\U00002A42', + "ncy;": '\U0000043D', + "ndash;": '\U00002013', + "ne;": '\U00002260', + "neArr;": '\U000021D7', + "nearhk;": '\U00002924', + "nearr;": '\U00002197', + "nearrow;": '\U00002197', + "nequiv;": '\U00002262', + "nesear;": '\U00002928', + "nexist;": '\U00002204', + "nexists;": '\U00002204', + "nfr;": '\U0001D52B', + "nge;": '\U00002271', + "ngeq;": '\U00002271', + "ngsim;": '\U00002275', + "ngt;": '\U0000226F', + "ngtr;": '\U0000226F', + "nhArr;": '\U000021CE', + "nharr;": '\U000021AE', + "nhpar;": '\U00002AF2', + "ni;": '\U0000220B', + "nis;": '\U000022FC', + "nisd;": '\U000022FA', + "niv;": '\U0000220B', + "njcy;": '\U0000045A', + "nlArr;": '\U000021CD', + "nlarr;": '\U0000219A', + "nldr;": '\U00002025', + "nle;": '\U00002270', + "nleftarrow;": '\U0000219A', + "nleftrightarrow;": '\U000021AE', + "nleq;": '\U00002270', + "nless;": '\U0000226E', + "nlsim;": '\U00002274', + "nlt;": '\U0000226E', + "nltri;": '\U000022EA', + "nltrie;": '\U000022EC', + "nmid;": '\U00002224', + "nopf;": '\U0001D55F', + "not;": '\U000000AC', + "notin;": '\U00002209', + "notinva;": '\U00002209', + "notinvb;": '\U000022F7', + "notinvc;": '\U000022F6', + "notni;": '\U0000220C', + "notniva;": '\U0000220C', + "notnivb;": '\U000022FE', + "notnivc;": '\U000022FD', + "npar;": '\U00002226', + "nparallel;": '\U00002226', + "npolint;": '\U00002A14', + "npr;": '\U00002280', + "nprcue;": '\U000022E0', + "nprec;": '\U00002280', + "nrArr;": '\U000021CF', + "nrarr;": '\U0000219B', + "nrightarrow;": '\U0000219B', + "nrtri;": '\U000022EB', + "nrtrie;": '\U000022ED', + "nsc;": '\U00002281', + "nsccue;": '\U000022E1', + "nscr;": '\U0001D4C3', + "nshortmid;": '\U00002224', + "nshortparallel;": '\U00002226', + "nsim;": '\U00002241', + "nsime;": '\U00002244', + "nsimeq;": '\U00002244', + "nsmid;": '\U00002224', + "nspar;": '\U00002226', + "nsqsube;": '\U000022E2', + "nsqsupe;": '\U000022E3', + "nsub;": '\U00002284', + "nsube;": '\U00002288', + "nsubseteq;": '\U00002288', + "nsucc;": '\U00002281', + "nsup;": '\U00002285', + "nsupe;": '\U00002289', + "nsupseteq;": '\U00002289', + "ntgl;": '\U00002279', + "ntilde;": '\U000000F1', + "ntlg;": '\U00002278', + "ntriangleleft;": '\U000022EA', + "ntrianglelefteq;": '\U000022EC', + "ntriangleright;": '\U000022EB', + "ntrianglerighteq;": '\U000022ED', + "nu;": '\U000003BD', + "num;": '\U00000023', + "numero;": '\U00002116', + "numsp;": '\U00002007', + "nvDash;": '\U000022AD', + "nvHarr;": '\U00002904', + "nvdash;": '\U000022AC', + "nvinfin;": '\U000029DE', + "nvlArr;": '\U00002902', + "nvrArr;": '\U00002903', + "nwArr;": '\U000021D6', + "nwarhk;": '\U00002923', + "nwarr;": '\U00002196', + "nwarrow;": '\U00002196', + "nwnear;": '\U00002927', + "oS;": '\U000024C8', + "oacute;": '\U000000F3', + "oast;": '\U0000229B', + "ocir;": '\U0000229A', + "ocirc;": '\U000000F4', + "ocy;": '\U0000043E', + "odash;": '\U0000229D', + "odblac;": '\U00000151', + "odiv;": '\U00002A38', + "odot;": '\U00002299', + "odsold;": '\U000029BC', + "oelig;": '\U00000153', + "ofcir;": '\U000029BF', + "ofr;": '\U0001D52C', + "ogon;": '\U000002DB', + "ograve;": '\U000000F2', + "ogt;": '\U000029C1', + "ohbar;": '\U000029B5', + "ohm;": '\U000003A9', + "oint;": '\U0000222E', + "olarr;": '\U000021BA', + "olcir;": '\U000029BE', + "olcross;": '\U000029BB', + "oline;": '\U0000203E', + "olt;": '\U000029C0', + "omacr;": '\U0000014D', + "omega;": '\U000003C9', + "omicron;": '\U000003BF', + "omid;": '\U000029B6', + "ominus;": '\U00002296', + "oopf;": '\U0001D560', + "opar;": '\U000029B7', + "operp;": '\U000029B9', + "oplus;": '\U00002295', + "or;": '\U00002228', + "orarr;": '\U000021BB', + "ord;": '\U00002A5D', + "order;": '\U00002134', + "orderof;": '\U00002134', + "ordf;": '\U000000AA', + "ordm;": '\U000000BA', + "origof;": '\U000022B6', + "oror;": '\U00002A56', + "orslope;": '\U00002A57', + "orv;": '\U00002A5B', + "oscr;": '\U00002134', + "oslash;": '\U000000F8', + "osol;": '\U00002298', + "otilde;": '\U000000F5', + "otimes;": '\U00002297', + "otimesas;": '\U00002A36', + "ouml;": '\U000000F6', + "ovbar;": '\U0000233D', + "par;": '\U00002225', + "para;": '\U000000B6', + "parallel;": '\U00002225', + "parsim;": '\U00002AF3', + "parsl;": '\U00002AFD', + "part;": '\U00002202', + "pcy;": '\U0000043F', + "percnt;": '\U00000025', + "period;": '\U0000002E', + "permil;": '\U00002030', + "perp;": '\U000022A5', + "pertenk;": '\U00002031', + "pfr;": '\U0001D52D', + "phi;": '\U000003C6', + "phiv;": '\U000003D5', + "phmmat;": '\U00002133', + "phone;": '\U0000260E', + "pi;": '\U000003C0', + "pitchfork;": '\U000022D4', + "piv;": '\U000003D6', + "planck;": '\U0000210F', + "planckh;": '\U0000210E', + "plankv;": '\U0000210F', + "plus;": '\U0000002B', + "plusacir;": '\U00002A23', + "plusb;": '\U0000229E', + "pluscir;": '\U00002A22', + "plusdo;": '\U00002214', + "plusdu;": '\U00002A25', + "pluse;": '\U00002A72', + "plusmn;": '\U000000B1', + "plussim;": '\U00002A26', + "plustwo;": '\U00002A27', + "pm;": '\U000000B1', + "pointint;": '\U00002A15', + "popf;": '\U0001D561', + "pound;": '\U000000A3', + "pr;": '\U0000227A', + "prE;": '\U00002AB3', + "prap;": '\U00002AB7', + "prcue;": '\U0000227C', + "pre;": '\U00002AAF', + "prec;": '\U0000227A', + "precapprox;": '\U00002AB7', + "preccurlyeq;": '\U0000227C', + "preceq;": '\U00002AAF', + "precnapprox;": '\U00002AB9', + "precneqq;": '\U00002AB5', + "precnsim;": '\U000022E8', + "precsim;": '\U0000227E', + "prime;": '\U00002032', + "primes;": '\U00002119', + "prnE;": '\U00002AB5', + "prnap;": '\U00002AB9', + "prnsim;": '\U000022E8', + "prod;": '\U0000220F', + "profalar;": '\U0000232E', + "profline;": '\U00002312', + "profsurf;": '\U00002313', + "prop;": '\U0000221D', + "propto;": '\U0000221D', + "prsim;": '\U0000227E', + "prurel;": '\U000022B0', + "pscr;": '\U0001D4C5', + "psi;": '\U000003C8', + "puncsp;": '\U00002008', + "qfr;": '\U0001D52E', + "qint;": '\U00002A0C', + "qopf;": '\U0001D562', + "qprime;": '\U00002057', + "qscr;": '\U0001D4C6', + "quaternions;": '\U0000210D', + "quatint;": '\U00002A16', + "quest;": '\U0000003F', + "questeq;": '\U0000225F', + "quot;": '\U00000022', + "rAarr;": '\U000021DB', + "rArr;": '\U000021D2', + "rAtail;": '\U0000291C', + "rBarr;": '\U0000290F', + "rHar;": '\U00002964', + "racute;": '\U00000155', + "radic;": '\U0000221A', + "raemptyv;": '\U000029B3', + "rang;": '\U000027E9', + "rangd;": '\U00002992', + "range;": '\U000029A5', + "rangle;": '\U000027E9', + "raquo;": '\U000000BB', + "rarr;": '\U00002192', + "rarrap;": '\U00002975', + "rarrb;": '\U000021E5', + "rarrbfs;": '\U00002920', + "rarrc;": '\U00002933', + "rarrfs;": '\U0000291E', + "rarrhk;": '\U000021AA', + "rarrlp;": '\U000021AC', + "rarrpl;": '\U00002945', + "rarrsim;": '\U00002974', + "rarrtl;": '\U000021A3', + "rarrw;": '\U0000219D', + "ratail;": '\U0000291A', + "ratio;": '\U00002236', + "rationals;": '\U0000211A', + "rbarr;": '\U0000290D', + "rbbrk;": '\U00002773', + "rbrace;": '\U0000007D', + "rbrack;": '\U0000005D', + "rbrke;": '\U0000298C', + "rbrksld;": '\U0000298E', + "rbrkslu;": '\U00002990', + "rcaron;": '\U00000159', + "rcedil;": '\U00000157', + "rceil;": '\U00002309', + "rcub;": '\U0000007D', + "rcy;": '\U00000440', + "rdca;": '\U00002937', + "rdldhar;": '\U00002969', + "rdquo;": '\U0000201D', + "rdquor;": '\U0000201D', + "rdsh;": '\U000021B3', + "real;": '\U0000211C', + "realine;": '\U0000211B', + "realpart;": '\U0000211C', + "reals;": '\U0000211D', + "rect;": '\U000025AD', + "reg;": '\U000000AE', + "rfisht;": '\U0000297D', + "rfloor;": '\U0000230B', + "rfr;": '\U0001D52F', + "rhard;": '\U000021C1', + "rharu;": '\U000021C0', + "rharul;": '\U0000296C', + "rho;": '\U000003C1', + "rhov;": '\U000003F1', + "rightarrow;": '\U00002192', + "rightarrowtail;": '\U000021A3', + "rightharpoondown;": '\U000021C1', + "rightharpoonup;": '\U000021C0', + "rightleftarrows;": '\U000021C4', + "rightleftharpoons;": '\U000021CC', + "rightrightarrows;": '\U000021C9', + "rightsquigarrow;": '\U0000219D', + "rightthreetimes;": '\U000022CC', + "ring;": '\U000002DA', + "risingdotseq;": '\U00002253', + "rlarr;": '\U000021C4', + "rlhar;": '\U000021CC', + "rlm;": '\U0000200F', + "rmoust;": '\U000023B1', + "rmoustache;": '\U000023B1', + "rnmid;": '\U00002AEE', + "roang;": '\U000027ED', + "roarr;": '\U000021FE', + "robrk;": '\U000027E7', + "ropar;": '\U00002986', + "ropf;": '\U0001D563', + "roplus;": '\U00002A2E', + "rotimes;": '\U00002A35', + "rpar;": '\U00000029', + "rpargt;": '\U00002994', + "rppolint;": '\U00002A12', + "rrarr;": '\U000021C9', + "rsaquo;": '\U0000203A', + "rscr;": '\U0001D4C7', + "rsh;": '\U000021B1', + "rsqb;": '\U0000005D', + "rsquo;": '\U00002019', + "rsquor;": '\U00002019', + "rthree;": '\U000022CC', + "rtimes;": '\U000022CA', + "rtri;": '\U000025B9', + "rtrie;": '\U000022B5', + "rtrif;": '\U000025B8', + "rtriltri;": '\U000029CE', + "ruluhar;": '\U00002968', + "rx;": '\U0000211E', + "sacute;": '\U0000015B', + "sbquo;": '\U0000201A', + "sc;": '\U0000227B', + "scE;": '\U00002AB4', + "scap;": '\U00002AB8', + "scaron;": '\U00000161', + "sccue;": '\U0000227D', + "sce;": '\U00002AB0', + "scedil;": '\U0000015F', + "scirc;": '\U0000015D', + "scnE;": '\U00002AB6', + "scnap;": '\U00002ABA', + "scnsim;": '\U000022E9', + "scpolint;": '\U00002A13', + "scsim;": '\U0000227F', + "scy;": '\U00000441', + "sdot;": '\U000022C5', + "sdotb;": '\U000022A1', + "sdote;": '\U00002A66', + "seArr;": '\U000021D8', + "searhk;": '\U00002925', + "searr;": '\U00002198', + "searrow;": '\U00002198', + "sect;": '\U000000A7', + "semi;": '\U0000003B', + "seswar;": '\U00002929', + "setminus;": '\U00002216', + "setmn;": '\U00002216', + "sext;": '\U00002736', + "sfr;": '\U0001D530', + "sfrown;": '\U00002322', + "sharp;": '\U0000266F', + "shchcy;": '\U00000449', + "shcy;": '\U00000448', + "shortmid;": '\U00002223', + "shortparallel;": '\U00002225', + "shy;": '\U000000AD', + "sigma;": '\U000003C3', + "sigmaf;": '\U000003C2', + "sigmav;": '\U000003C2', + "sim;": '\U0000223C', + "simdot;": '\U00002A6A', + "sime;": '\U00002243', + "simeq;": '\U00002243', + "simg;": '\U00002A9E', + "simgE;": '\U00002AA0', + "siml;": '\U00002A9D', + "simlE;": '\U00002A9F', + "simne;": '\U00002246', + "simplus;": '\U00002A24', + "simrarr;": '\U00002972', + "slarr;": '\U00002190', + "smallsetminus;": '\U00002216', + "smashp;": '\U00002A33', + "smeparsl;": '\U000029E4', + "smid;": '\U00002223', + "smile;": '\U00002323', + "smt;": '\U00002AAA', + "smte;": '\U00002AAC', + "softcy;": '\U0000044C', + "sol;": '\U0000002F', + "solb;": '\U000029C4', + "solbar;": '\U0000233F', + "sopf;": '\U0001D564', + "spades;": '\U00002660', + "spadesuit;": '\U00002660', + "spar;": '\U00002225', + "sqcap;": '\U00002293', + "sqcup;": '\U00002294', + "sqsub;": '\U0000228F', + "sqsube;": '\U00002291', + "sqsubset;": '\U0000228F', + "sqsubseteq;": '\U00002291', + "sqsup;": '\U00002290', + "sqsupe;": '\U00002292', + "sqsupset;": '\U00002290', + "sqsupseteq;": '\U00002292', + "squ;": '\U000025A1', + "square;": '\U000025A1', + "squarf;": '\U000025AA', + "squf;": '\U000025AA', + "srarr;": '\U00002192', + "sscr;": '\U0001D4C8', + "ssetmn;": '\U00002216', + "ssmile;": '\U00002323', + "sstarf;": '\U000022C6', + "star;": '\U00002606', + "starf;": '\U00002605', + "straightepsilon;": '\U000003F5', + "straightphi;": '\U000003D5', + "strns;": '\U000000AF', + "sub;": '\U00002282', + "subE;": '\U00002AC5', + "subdot;": '\U00002ABD', + "sube;": '\U00002286', + "subedot;": '\U00002AC3', + "submult;": '\U00002AC1', + "subnE;": '\U00002ACB', + "subne;": '\U0000228A', + "subplus;": '\U00002ABF', + "subrarr;": '\U00002979', + "subset;": '\U00002282', + "subseteq;": '\U00002286', + "subseteqq;": '\U00002AC5', + "subsetneq;": '\U0000228A', + "subsetneqq;": '\U00002ACB', + "subsim;": '\U00002AC7', + "subsub;": '\U00002AD5', + "subsup;": '\U00002AD3', + "succ;": '\U0000227B', + "succapprox;": '\U00002AB8', + "succcurlyeq;": '\U0000227D', + "succeq;": '\U00002AB0', + "succnapprox;": '\U00002ABA', + "succneqq;": '\U00002AB6', + "succnsim;": '\U000022E9', + "succsim;": '\U0000227F', + "sum;": '\U00002211', + "sung;": '\U0000266A', + "sup;": '\U00002283', + "sup1;": '\U000000B9', + "sup2;": '\U000000B2', + "sup3;": '\U000000B3', + "supE;": '\U00002AC6', + "supdot;": '\U00002ABE', + "supdsub;": '\U00002AD8', + "supe;": '\U00002287', + "supedot;": '\U00002AC4', + "suphsol;": '\U000027C9', + "suphsub;": '\U00002AD7', + "suplarr;": '\U0000297B', + "supmult;": '\U00002AC2', + "supnE;": '\U00002ACC', + "supne;": '\U0000228B', + "supplus;": '\U00002AC0', + "supset;": '\U00002283', + "supseteq;": '\U00002287', + "supseteqq;": '\U00002AC6', + "supsetneq;": '\U0000228B', + "supsetneqq;": '\U00002ACC', + "supsim;": '\U00002AC8', + "supsub;": '\U00002AD4', + "supsup;": '\U00002AD6', + "swArr;": '\U000021D9', + "swarhk;": '\U00002926', + "swarr;": '\U00002199', + "swarrow;": '\U00002199', + "swnwar;": '\U0000292A', + "szlig;": '\U000000DF', + "target;": '\U00002316', + "tau;": '\U000003C4', + "tbrk;": '\U000023B4', + "tcaron;": '\U00000165', + "tcedil;": '\U00000163', + "tcy;": '\U00000442', + "tdot;": '\U000020DB', + "telrec;": '\U00002315', + "tfr;": '\U0001D531', + "there4;": '\U00002234', + "therefore;": '\U00002234', + "theta;": '\U000003B8', + "thetasym;": '\U000003D1', + "thetav;": '\U000003D1', + "thickapprox;": '\U00002248', + "thicksim;": '\U0000223C', + "thinsp;": '\U00002009', + "thkap;": '\U00002248', + "thksim;": '\U0000223C', + "thorn;": '\U000000FE', + "tilde;": '\U000002DC', + "times;": '\U000000D7', + "timesb;": '\U000022A0', + "timesbar;": '\U00002A31', + "timesd;": '\U00002A30', + "tint;": '\U0000222D', + "toea;": '\U00002928', + "top;": '\U000022A4', + "topbot;": '\U00002336', + "topcir;": '\U00002AF1', + "topf;": '\U0001D565', + "topfork;": '\U00002ADA', + "tosa;": '\U00002929', + "tprime;": '\U00002034', + "trade;": '\U00002122', + "triangle;": '\U000025B5', + "triangledown;": '\U000025BF', + "triangleleft;": '\U000025C3', + "trianglelefteq;": '\U000022B4', + "triangleq;": '\U0000225C', + "triangleright;": '\U000025B9', + "trianglerighteq;": '\U000022B5', + "tridot;": '\U000025EC', + "trie;": '\U0000225C', + "triminus;": '\U00002A3A', + "triplus;": '\U00002A39', + "trisb;": '\U000029CD', + "tritime;": '\U00002A3B', + "trpezium;": '\U000023E2', + "tscr;": '\U0001D4C9', + "tscy;": '\U00000446', + "tshcy;": '\U0000045B', + "tstrok;": '\U00000167', + "twixt;": '\U0000226C', + "twoheadleftarrow;": '\U0000219E', + "twoheadrightarrow;": '\U000021A0', + "uArr;": '\U000021D1', + "uHar;": '\U00002963', + "uacute;": '\U000000FA', + "uarr;": '\U00002191', + "ubrcy;": '\U0000045E', + "ubreve;": '\U0000016D', + "ucirc;": '\U000000FB', + "ucy;": '\U00000443', + "udarr;": '\U000021C5', + "udblac;": '\U00000171', + "udhar;": '\U0000296E', + "ufisht;": '\U0000297E', + "ufr;": '\U0001D532', + "ugrave;": '\U000000F9', + "uharl;": '\U000021BF', + "uharr;": '\U000021BE', + "uhblk;": '\U00002580', + "ulcorn;": '\U0000231C', + "ulcorner;": '\U0000231C', + "ulcrop;": '\U0000230F', + "ultri;": '\U000025F8', + "umacr;": '\U0000016B', + "uml;": '\U000000A8', + "uogon;": '\U00000173', + "uopf;": '\U0001D566', + "uparrow;": '\U00002191', + "updownarrow;": '\U00002195', + "upharpoonleft;": '\U000021BF', + "upharpoonright;": '\U000021BE', + "uplus;": '\U0000228E', + "upsi;": '\U000003C5', + "upsih;": '\U000003D2', + "upsilon;": '\U000003C5', + "upuparrows;": '\U000021C8', + "urcorn;": '\U0000231D', + "urcorner;": '\U0000231D', + "urcrop;": '\U0000230E', + "uring;": '\U0000016F', + "urtri;": '\U000025F9', + "uscr;": '\U0001D4CA', + "utdot;": '\U000022F0', + "utilde;": '\U00000169', + "utri;": '\U000025B5', + "utrif;": '\U000025B4', + "uuarr;": '\U000021C8', + "uuml;": '\U000000FC', + "uwangle;": '\U000029A7', + "vArr;": '\U000021D5', + "vBar;": '\U00002AE8', + "vBarv;": '\U00002AE9', + "vDash;": '\U000022A8', + "vangrt;": '\U0000299C', + "varepsilon;": '\U000003F5', + "varkappa;": '\U000003F0', + "varnothing;": '\U00002205', + "varphi;": '\U000003D5', + "varpi;": '\U000003D6', + "varpropto;": '\U0000221D', + "varr;": '\U00002195', + "varrho;": '\U000003F1', + "varsigma;": '\U000003C2', + "vartheta;": '\U000003D1', + "vartriangleleft;": '\U000022B2', + "vartriangleright;": '\U000022B3', + "vcy;": '\U00000432', + "vdash;": '\U000022A2', + "vee;": '\U00002228', + "veebar;": '\U000022BB', + "veeeq;": '\U0000225A', + "vellip;": '\U000022EE', + "verbar;": '\U0000007C', + "vert;": '\U0000007C', + "vfr;": '\U0001D533', + "vltri;": '\U000022B2', + "vopf;": '\U0001D567', + "vprop;": '\U0000221D', + "vrtri;": '\U000022B3', + "vscr;": '\U0001D4CB', + "vzigzag;": '\U0000299A', + "wcirc;": '\U00000175', + "wedbar;": '\U00002A5F', + "wedge;": '\U00002227', + "wedgeq;": '\U00002259', + "weierp;": '\U00002118', + "wfr;": '\U0001D534', + "wopf;": '\U0001D568', + "wp;": '\U00002118', + "wr;": '\U00002240', + "wreath;": '\U00002240', + "wscr;": '\U0001D4CC', + "xcap;": '\U000022C2', + "xcirc;": '\U000025EF', + "xcup;": '\U000022C3', + "xdtri;": '\U000025BD', + "xfr;": '\U0001D535', + "xhArr;": '\U000027FA', + "xharr;": '\U000027F7', + "xi;": '\U000003BE', + "xlArr;": '\U000027F8', + "xlarr;": '\U000027F5', + "xmap;": '\U000027FC', + "xnis;": '\U000022FB', + "xodot;": '\U00002A00', + "xopf;": '\U0001D569', + "xoplus;": '\U00002A01', + "xotime;": '\U00002A02', + "xrArr;": '\U000027F9', + "xrarr;": '\U000027F6', + "xscr;": '\U0001D4CD', + "xsqcup;": '\U00002A06', + "xuplus;": '\U00002A04', + "xutri;": '\U000025B3', + "xvee;": '\U000022C1', + "xwedge;": '\U000022C0', + "yacute;": '\U000000FD', + "yacy;": '\U0000044F', + "ycirc;": '\U00000177', + "ycy;": '\U0000044B', + "yen;": '\U000000A5', + "yfr;": '\U0001D536', + "yicy;": '\U00000457', + "yopf;": '\U0001D56A', + "yscr;": '\U0001D4CE', + "yucy;": '\U0000044E', + "yuml;": '\U000000FF', + "zacute;": '\U0000017A', + "zcaron;": '\U0000017E', + "zcy;": '\U00000437', + "zdot;": '\U0000017C', + "zeetrf;": '\U00002128', + "zeta;": '\U000003B6', + "zfr;": '\U0001D537', + "zhcy;": '\U00000436', + "zigrarr;": '\U000021DD', + "zopf;": '\U0001D56B', + "zscr;": '\U0001D4CF', + "zwj;": '\U0000200D', + "zwnj;": '\U0000200C', + "AElig": '\U000000C6', + "AMP": '\U00000026', + "Aacute": '\U000000C1', + "Acirc": '\U000000C2', + "Agrave": '\U000000C0', + "Aring": '\U000000C5', + "Atilde": '\U000000C3', + "Auml": '\U000000C4', + "COPY": '\U000000A9', + "Ccedil": '\U000000C7', + "ETH": '\U000000D0', + "Eacute": '\U000000C9', + "Ecirc": '\U000000CA', + "Egrave": '\U000000C8', + "Euml": '\U000000CB', + "GT": '\U0000003E', + "Iacute": '\U000000CD', + "Icirc": '\U000000CE', + "Igrave": '\U000000CC', + "Iuml": '\U000000CF', + "LT": '\U0000003C', + "Ntilde": '\U000000D1', + "Oacute": '\U000000D3', + "Ocirc": '\U000000D4', + "Ograve": '\U000000D2', + "Oslash": '\U000000D8', + "Otilde": '\U000000D5', + "Ouml": '\U000000D6', + "QUOT": '\U00000022', + "REG": '\U000000AE', + "THORN": '\U000000DE', + "Uacute": '\U000000DA', + "Ucirc": '\U000000DB', + "Ugrave": '\U000000D9', + "Uuml": '\U000000DC', + "Yacute": '\U000000DD', + "aacute": '\U000000E1', + "acirc": '\U000000E2', + "acute": '\U000000B4', + "aelig": '\U000000E6', + "agrave": '\U000000E0', + "amp": '\U00000026', + "aring": '\U000000E5', + "atilde": '\U000000E3', + "auml": '\U000000E4', + "brvbar": '\U000000A6', + "ccedil": '\U000000E7', + "cedil": '\U000000B8', + "cent": '\U000000A2', + "copy": '\U000000A9', + "curren": '\U000000A4', + "deg": '\U000000B0', + "divide": '\U000000F7', + "eacute": '\U000000E9', + "ecirc": '\U000000EA', + "egrave": '\U000000E8', + "eth": '\U000000F0', + "euml": '\U000000EB', + "frac12": '\U000000BD', + "frac14": '\U000000BC', + "frac34": '\U000000BE', + "gt": '\U0000003E', + "iacute": '\U000000ED', + "icirc": '\U000000EE', + "iexcl": '\U000000A1', + "igrave": '\U000000EC', + "iquest": '\U000000BF', + "iuml": '\U000000EF', + "laquo": '\U000000AB', + "lt": '\U0000003C', + "macr": '\U000000AF', + "micro": '\U000000B5', + "middot": '\U000000B7', + "nbsp": '\U000000A0', + "not": '\U000000AC', + "ntilde": '\U000000F1', + "oacute": '\U000000F3', + "ocirc": '\U000000F4', + "ograve": '\U000000F2', + "ordf": '\U000000AA', + "ordm": '\U000000BA', + "oslash": '\U000000F8', + "otilde": '\U000000F5', + "ouml": '\U000000F6', + "para": '\U000000B6', + "plusmn": '\U000000B1', + "pound": '\U000000A3', + "quot": '\U00000022', + "raquo": '\U000000BB', + "reg": '\U000000AE', + "sect": '\U000000A7', + "shy": '\U000000AD', + "sup1": '\U000000B9', + "sup2": '\U000000B2', + "sup3": '\U000000B3', + "szlig": '\U000000DF', + "thorn": '\U000000FE', + "times": '\U000000D7', + "uacute": '\U000000FA', + "ucirc": '\U000000FB', + "ugrave": '\U000000F9', + "uml": '\U000000A8', + "uuml": '\U000000FC', + "yacute": '\U000000FD', + "yen": '\U000000A5', + "yuml": '\U000000FF', +} + +// HTML entities that are two unicode codepoints. +var entity2 = map[string][2]rune{ + // TODO(nigeltao): Handle replacements that are wider than their names. + // "nLt;": {'\u226A', '\u20D2'}, + // "nGt;": {'\u226B', '\u20D2'}, + "NotEqualTilde;": {'\u2242', '\u0338'}, + "NotGreaterFullEqual;": {'\u2267', '\u0338'}, + "NotGreaterGreater;": {'\u226B', '\u0338'}, + "NotGreaterSlantEqual;": {'\u2A7E', '\u0338'}, + "NotHumpDownHump;": {'\u224E', '\u0338'}, + "NotHumpEqual;": {'\u224F', '\u0338'}, + "NotLeftTriangleBar;": {'\u29CF', '\u0338'}, + "NotLessLess;": {'\u226A', '\u0338'}, + "NotLessSlantEqual;": {'\u2A7D', '\u0338'}, + "NotNestedGreaterGreater;": {'\u2AA2', '\u0338'}, + "NotNestedLessLess;": {'\u2AA1', '\u0338'}, + "NotPrecedesEqual;": {'\u2AAF', '\u0338'}, + "NotRightTriangleBar;": {'\u29D0', '\u0338'}, + "NotSquareSubset;": {'\u228F', '\u0338'}, + "NotSquareSuperset;": {'\u2290', '\u0338'}, + "NotSubset;": {'\u2282', '\u20D2'}, + "NotSucceedsEqual;": {'\u2AB0', '\u0338'}, + "NotSucceedsTilde;": {'\u227F', '\u0338'}, + "NotSuperset;": {'\u2283', '\u20D2'}, + "ThickSpace;": {'\u205F', '\u200A'}, + "acE;": {'\u223E', '\u0333'}, + "bne;": {'\u003D', '\u20E5'}, + "bnequiv;": {'\u2261', '\u20E5'}, + "caps;": {'\u2229', '\uFE00'}, + "cups;": {'\u222A', '\uFE00'}, + "fjlig;": {'\u0066', '\u006A'}, + "gesl;": {'\u22DB', '\uFE00'}, + "gvertneqq;": {'\u2269', '\uFE00'}, + "gvnE;": {'\u2269', '\uFE00'}, + "lates;": {'\u2AAD', '\uFE00'}, + "lesg;": {'\u22DA', '\uFE00'}, + "lvertneqq;": {'\u2268', '\uFE00'}, + "lvnE;": {'\u2268', '\uFE00'}, + "nGg;": {'\u22D9', '\u0338'}, + "nGtv;": {'\u226B', '\u0338'}, + "nLl;": {'\u22D8', '\u0338'}, + "nLtv;": {'\u226A', '\u0338'}, + "nang;": {'\u2220', '\u20D2'}, + "napE;": {'\u2A70', '\u0338'}, + "napid;": {'\u224B', '\u0338'}, + "nbump;": {'\u224E', '\u0338'}, + "nbumpe;": {'\u224F', '\u0338'}, + "ncongdot;": {'\u2A6D', '\u0338'}, + "nedot;": {'\u2250', '\u0338'}, + "nesim;": {'\u2242', '\u0338'}, + "ngE;": {'\u2267', '\u0338'}, + "ngeqq;": {'\u2267', '\u0338'}, + "ngeqslant;": {'\u2A7E', '\u0338'}, + "nges;": {'\u2A7E', '\u0338'}, + "nlE;": {'\u2266', '\u0338'}, + "nleqq;": {'\u2266', '\u0338'}, + "nleqslant;": {'\u2A7D', '\u0338'}, + "nles;": {'\u2A7D', '\u0338'}, + "notinE;": {'\u22F9', '\u0338'}, + "notindot;": {'\u22F5', '\u0338'}, + "nparsl;": {'\u2AFD', '\u20E5'}, + "npart;": {'\u2202', '\u0338'}, + "npre;": {'\u2AAF', '\u0338'}, + "npreceq;": {'\u2AAF', '\u0338'}, + "nrarrc;": {'\u2933', '\u0338'}, + "nrarrw;": {'\u219D', '\u0338'}, + "nsce;": {'\u2AB0', '\u0338'}, + "nsubE;": {'\u2AC5', '\u0338'}, + "nsubset;": {'\u2282', '\u20D2'}, + "nsubseteqq;": {'\u2AC5', '\u0338'}, + "nsucceq;": {'\u2AB0', '\u0338'}, + "nsupE;": {'\u2AC6', '\u0338'}, + "nsupset;": {'\u2283', '\u20D2'}, + "nsupseteqq;": {'\u2AC6', '\u0338'}, + "nvap;": {'\u224D', '\u20D2'}, + "nvge;": {'\u2265', '\u20D2'}, + "nvgt;": {'\u003E', '\u20D2'}, + "nvle;": {'\u2264', '\u20D2'}, + "nvlt;": {'\u003C', '\u20D2'}, + "nvltrie;": {'\u22B4', '\u20D2'}, + "nvrtrie;": {'\u22B5', '\u20D2'}, + "nvsim;": {'\u223C', '\u20D2'}, + "race;": {'\u223D', '\u0331'}, + "smtes;": {'\u2AAC', '\uFE00'}, + "sqcaps;": {'\u2293', '\uFE00'}, + "sqcups;": {'\u2294', '\uFE00'}, + "varsubsetneq;": {'\u228A', '\uFE00'}, + "varsubsetneqq;": {'\u2ACB', '\uFE00'}, + "varsupsetneq;": {'\u228B', '\uFE00'}, + "varsupsetneqq;": {'\u2ACC', '\uFE00'}, + "vnsub;": {'\u2282', '\u20D2'}, + "vnsup;": {'\u2283', '\u20D2'}, + "vsubnE;": {'\u2ACB', '\uFE00'}, + "vsubne;": {'\u228A', '\uFE00'}, + "vsupnE;": {'\u2ACC', '\uFE00'}, + "vsupne;": {'\u228B', '\uFE00'}, +} diff --git a/vendor/golang.org/x/net/html/escape.go b/vendor/golang.org/x/net/html/escape.go new file mode 100644 index 0000000..d856139 --- /dev/null +++ b/vendor/golang.org/x/net/html/escape.go @@ -0,0 +1,258 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package html + +import ( + "bytes" + "strings" + "unicode/utf8" +) + +// These replacements permit compatibility with old numeric entities that +// assumed Windows-1252 encoding. +// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference +var replacementTable = [...]rune{ + '\u20AC', // First entry is what 0x80 should be replaced with. + '\u0081', + '\u201A', + '\u0192', + '\u201E', + '\u2026', + '\u2020', + '\u2021', + '\u02C6', + '\u2030', + '\u0160', + '\u2039', + '\u0152', + '\u008D', + '\u017D', + '\u008F', + '\u0090', + '\u2018', + '\u2019', + '\u201C', + '\u201D', + '\u2022', + '\u2013', + '\u2014', + '\u02DC', + '\u2122', + '\u0161', + '\u203A', + '\u0153', + '\u009D', + '\u017E', + '\u0178', // Last entry is 0x9F. + // 0x00->'\uFFFD' is handled programmatically. + // 0x0D->'\u000D' is a no-op. +} + +// unescapeEntity reads an entity like "<" from b[src:] and writes the +// corresponding "<" to b[dst:], returning the incremented dst and src cursors. +// Precondition: b[src] == '&' && dst <= src. +// attribute should be true if parsing an attribute value. +func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) { + // https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference + + // i starts at 1 because we already know that s[0] == '&'. + i, s := 1, b[src:] + + if len(s) <= 1 { + b[dst] = b[src] + return dst + 1, src + 1 + } + + if s[i] == '#' { + if len(s) <= 3 { // We need to have at least ".". + b[dst] = b[src] + return dst + 1, src + 1 + } + i++ + c := s[i] + hex := false + if c == 'x' || c == 'X' { + hex = true + i++ + } + + x := '\x00' + for i < len(s) { + c = s[i] + i++ + if hex { + if '0' <= c && c <= '9' { + x = 16*x + rune(c) - '0' + continue + } else if 'a' <= c && c <= 'f' { + x = 16*x + rune(c) - 'a' + 10 + continue + } else if 'A' <= c && c <= 'F' { + x = 16*x + rune(c) - 'A' + 10 + continue + } + } else if '0' <= c && c <= '9' { + x = 10*x + rune(c) - '0' + continue + } + if c != ';' { + i-- + } + break + } + + if i <= 3 { // No characters matched. + b[dst] = b[src] + return dst + 1, src + 1 + } + + if 0x80 <= x && x <= 0x9F { + // Replace characters from Windows-1252 with UTF-8 equivalents. + x = replacementTable[x-0x80] + } else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF { + // Replace invalid characters with the replacement character. + x = '\uFFFD' + } + + return dst + utf8.EncodeRune(b[dst:], x), src + i + } + + // Consume the maximum number of characters possible, with the + // consumed characters matching one of the named references. + + for i < len(s) { + c := s[i] + i++ + // Lower-cased characters are more common in entities, so we check for them first. + if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' { + continue + } + if c != ';' { + i-- + } + break + } + + entityName := string(s[1:i]) + if entityName == "" { + // No-op. + } else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' { + // No-op. + } else if x := entity[entityName]; x != 0 { + return dst + utf8.EncodeRune(b[dst:], x), src + i + } else if x := entity2[entityName]; x[0] != 0 { + dst1 := dst + utf8.EncodeRune(b[dst:], x[0]) + return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i + } else if !attribute { + maxLen := len(entityName) - 1 + if maxLen > longestEntityWithoutSemicolon { + maxLen = longestEntityWithoutSemicolon + } + for j := maxLen; j > 1; j-- { + if x := entity[entityName[:j]]; x != 0 { + return dst + utf8.EncodeRune(b[dst:], x), src + j + 1 + } + } + } + + dst1, src1 = dst+i, src+i + copy(b[dst:dst1], b[src:src1]) + return dst1, src1 +} + +// unescape unescapes b's entities in-place, so that "a<b" becomes "a': + esc = ">" + case '"': + // """ is shorter than """. + esc = """ + case '\r': + esc = " " + default: + panic("unrecognized escape character") + } + s = s[i+1:] + if _, err := w.WriteString(esc); err != nil { + return err + } + i = strings.IndexAny(s, escapedChars) + } + _, err := w.WriteString(s) + return err +} + +// EscapeString escapes special characters like "<" to become "<". It +// escapes only five such characters: <, >, &, ' and ". +// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't +// always true. +func EscapeString(s string) string { + if strings.IndexAny(s, escapedChars) == -1 { + return s + } + var buf bytes.Buffer + escape(&buf, s) + return buf.String() +} + +// UnescapeString unescapes entities like "<" to become "<". It unescapes a +// larger range of entities than EscapeString escapes. For example, "á" +// unescapes to "á", as does "á" and "&xE1;". +// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't +// always true. +func UnescapeString(s string) string { + for _, c := range s { + if c == '&' { + return string(unescape([]byte(s), false)) + } + } + return s +} diff --git a/vendor/golang.org/x/net/html/foreign.go b/vendor/golang.org/x/net/html/foreign.go new file mode 100644 index 0000000..9da9e9d --- /dev/null +++ b/vendor/golang.org/x/net/html/foreign.go @@ -0,0 +1,222 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package html + +import ( + "strings" +) + +func adjustAttributeNames(aa []Attribute, nameMap map[string]string) { + for i := range aa { + if newName, ok := nameMap[aa[i].Key]; ok { + aa[i].Key = newName + } + } +} + +func adjustForeignAttributes(aa []Attribute) { + for i, a := range aa { + if a.Key == "" || a.Key[0] != 'x' { + continue + } + switch a.Key { + case "xlink:actuate", "xlink:arcrole", "xlink:href", "xlink:role", "xlink:show", + "xlink:title", "xlink:type", "xml:base", "xml:lang", "xml:space", "xmlns:xlink": + j := strings.Index(a.Key, ":") + aa[i].Namespace = a.Key[:j] + aa[i].Key = a.Key[j+1:] + } + } +} + +func htmlIntegrationPoint(n *Node) bool { + if n.Type != ElementNode { + return false + } + switch n.Namespace { + case "math": + if n.Data == "annotation-xml" { + for _, a := range n.Attr { + if a.Key == "encoding" { + val := strings.ToLower(a.Val) + if val == "text/html" || val == "application/xhtml+xml" { + return true + } + } + } + } + case "svg": + switch n.Data { + case "desc", "foreignObject", "title": + return true + } + } + return false +} + +func mathMLTextIntegrationPoint(n *Node) bool { + if n.Namespace != "math" { + return false + } + switch n.Data { + case "mi", "mo", "mn", "ms", "mtext": + return true + } + return false +} + +// Section 12.2.6.5. +var breakout = map[string]bool{ + "b": true, + "big": true, + "blockquote": true, + "body": true, + "br": true, + "center": true, + "code": true, + "dd": true, + "div": true, + "dl": true, + "dt": true, + "em": true, + "embed": true, + "h1": true, + "h2": true, + "h3": true, + "h4": true, + "h5": true, + "h6": true, + "head": true, + "hr": true, + "i": true, + "img": true, + "li": true, + "listing": true, + "menu": true, + "meta": true, + "nobr": true, + "ol": true, + "p": true, + "pre": true, + "ruby": true, + "s": true, + "small": true, + "span": true, + "strong": true, + "strike": true, + "sub": true, + "sup": true, + "table": true, + "tt": true, + "u": true, + "ul": true, + "var": true, +} + +// Section 12.2.6.5. +var svgTagNameAdjustments = map[string]string{ + "altglyph": "altGlyph", + "altglyphdef": "altGlyphDef", + "altglyphitem": "altGlyphItem", + "animatecolor": "animateColor", + "animatemotion": "animateMotion", + "animatetransform": "animateTransform", + "clippath": "clipPath", + "feblend": "feBlend", + "fecolormatrix": "feColorMatrix", + "fecomponenttransfer": "feComponentTransfer", + "fecomposite": "feComposite", + "feconvolvematrix": "feConvolveMatrix", + "fediffuselighting": "feDiffuseLighting", + "fedisplacementmap": "feDisplacementMap", + "fedistantlight": "feDistantLight", + "feflood": "feFlood", + "fefunca": "feFuncA", + "fefuncb": "feFuncB", + "fefuncg": "feFuncG", + "fefuncr": "feFuncR", + "fegaussianblur": "feGaussianBlur", + "feimage": "feImage", + "femerge": "feMerge", + "femergenode": "feMergeNode", + "femorphology": "feMorphology", + "feoffset": "feOffset", + "fepointlight": "fePointLight", + "fespecularlighting": "feSpecularLighting", + "fespotlight": "feSpotLight", + "fetile": "feTile", + "feturbulence": "feTurbulence", + "foreignobject": "foreignObject", + "glyphref": "glyphRef", + "lineargradient": "linearGradient", + "radialgradient": "radialGradient", + "textpath": "textPath", +} + +// Section 12.2.6.1 +var mathMLAttributeAdjustments = map[string]string{ + "definitionurl": "definitionURL", +} + +var svgAttributeAdjustments = map[string]string{ + "attributename": "attributeName", + "attributetype": "attributeType", + "basefrequency": "baseFrequency", + "baseprofile": "baseProfile", + "calcmode": "calcMode", + "clippathunits": "clipPathUnits", + "diffuseconstant": "diffuseConstant", + "edgemode": "edgeMode", + "filterunits": "filterUnits", + "glyphref": "glyphRef", + "gradienttransform": "gradientTransform", + "gradientunits": "gradientUnits", + "kernelmatrix": "kernelMatrix", + "kernelunitlength": "kernelUnitLength", + "keypoints": "keyPoints", + "keysplines": "keySplines", + "keytimes": "keyTimes", + "lengthadjust": "lengthAdjust", + "limitingconeangle": "limitingConeAngle", + "markerheight": "markerHeight", + "markerunits": "markerUnits", + "markerwidth": "markerWidth", + "maskcontentunits": "maskContentUnits", + "maskunits": "maskUnits", + "numoctaves": "numOctaves", + "pathlength": "pathLength", + "patterncontentunits": "patternContentUnits", + "patterntransform": "patternTransform", + "patternunits": "patternUnits", + "pointsatx": "pointsAtX", + "pointsaty": "pointsAtY", + "pointsatz": "pointsAtZ", + "preservealpha": "preserveAlpha", + "preserveaspectratio": "preserveAspectRatio", + "primitiveunits": "primitiveUnits", + "refx": "refX", + "refy": "refY", + "repeatcount": "repeatCount", + "repeatdur": "repeatDur", + "requiredextensions": "requiredExtensions", + "requiredfeatures": "requiredFeatures", + "specularconstant": "specularConstant", + "specularexponent": "specularExponent", + "spreadmethod": "spreadMethod", + "startoffset": "startOffset", + "stddeviation": "stdDeviation", + "stitchtiles": "stitchTiles", + "surfacescale": "surfaceScale", + "systemlanguage": "systemLanguage", + "tablevalues": "tableValues", + "targetx": "targetX", + "targety": "targetY", + "textlength": "textLength", + "viewbox": "viewBox", + "viewtarget": "viewTarget", + "xchannelselector": "xChannelSelector", + "ychannelselector": "yChannelSelector", + "zoomandpan": "zoomAndPan", +} diff --git a/vendor/golang.org/x/net/html/node.go b/vendor/golang.org/x/net/html/node.go new file mode 100644 index 0000000..1350eef --- /dev/null +++ b/vendor/golang.org/x/net/html/node.go @@ -0,0 +1,225 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package html + +import ( + "golang.org/x/net/html/atom" +) + +// A NodeType is the type of a Node. +type NodeType uint32 + +const ( + ErrorNode NodeType = iota + TextNode + DocumentNode + ElementNode + CommentNode + DoctypeNode + // RawNode nodes are not returned by the parser, but can be part of the + // Node tree passed to func Render to insert raw HTML (without escaping). + // If so, this package makes no guarantee that the rendered HTML is secure + // (from e.g. Cross Site Scripting attacks) or well-formed. + RawNode + scopeMarkerNode +) + +// Section 12.2.4.3 says "The markers are inserted when entering applet, +// object, marquee, template, td, th, and caption elements, and are used +// to prevent formatting from "leaking" into applet, object, marquee, +// template, td, th, and caption elements". +var scopeMarker = Node{Type: scopeMarkerNode} + +// A Node consists of a NodeType and some Data (tag name for element nodes, +// content for text) and are part of a tree of Nodes. Element nodes may also +// have a Namespace and contain a slice of Attributes. Data is unescaped, so +// that it looks like "a 0 { + return (*s)[i-1] + } + return nil +} + +// index returns the index of the top-most occurrence of n in the stack, or -1 +// if n is not present. +func (s *nodeStack) index(n *Node) int { + for i := len(*s) - 1; i >= 0; i-- { + if (*s)[i] == n { + return i + } + } + return -1 +} + +// contains returns whether a is within s. +func (s *nodeStack) contains(a atom.Atom) bool { + for _, n := range *s { + if n.DataAtom == a && n.Namespace == "" { + return true + } + } + return false +} + +// insert inserts a node at the given index. +func (s *nodeStack) insert(i int, n *Node) { + (*s) = append(*s, nil) + copy((*s)[i+1:], (*s)[i:]) + (*s)[i] = n +} + +// remove removes a node from the stack. It is a no-op if n is not present. +func (s *nodeStack) remove(n *Node) { + i := s.index(n) + if i == -1 { + return + } + copy((*s)[i:], (*s)[i+1:]) + j := len(*s) - 1 + (*s)[j] = nil + *s = (*s)[:j] +} + +type insertionModeStack []insertionMode + +func (s *insertionModeStack) pop() (im insertionMode) { + i := len(*s) + im = (*s)[i-1] + *s = (*s)[:i-1] + return im +} + +func (s *insertionModeStack) top() insertionMode { + if i := len(*s); i > 0 { + return (*s)[i-1] + } + return nil +} diff --git a/vendor/golang.org/x/net/html/parse.go b/vendor/golang.org/x/net/html/parse.go new file mode 100644 index 0000000..f91466f --- /dev/null +++ b/vendor/golang.org/x/net/html/parse.go @@ -0,0 +1,2438 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package html + +import ( + "errors" + "fmt" + "io" + "strings" + + a "golang.org/x/net/html/atom" +) + +// A parser implements the HTML5 parsing algorithm: +// https://html.spec.whatwg.org/multipage/syntax.html#tree-construction +type parser struct { + // tokenizer provides the tokens for the parser. + tokenizer *Tokenizer + // tok is the most recently read token. + tok Token + // Self-closing tags like are treated as start tags, except that + // hasSelfClosingToken is set while they are being processed. + hasSelfClosingToken bool + // doc is the document root element. + doc *Node + // The stack of open elements (section 12.2.4.2) and active formatting + // elements (section 12.2.4.3). + oe, afe nodeStack + // Element pointers (section 12.2.4.4). + head, form *Node + // Other parsing state flags (section 12.2.4.5). + scripting, framesetOK bool + // The stack of template insertion modes + templateStack insertionModeStack + // im is the current insertion mode. + im insertionMode + // originalIM is the insertion mode to go back to after completing a text + // or inTableText insertion mode. + originalIM insertionMode + // fosterParenting is whether new elements should be inserted according to + // the foster parenting rules (section 12.2.6.1). + fosterParenting bool + // quirks is whether the parser is operating in "quirks mode." + quirks bool + // fragment is whether the parser is parsing an HTML fragment. + fragment bool + // context is the context element when parsing an HTML fragment + // (section 12.4). + context *Node +} + +func (p *parser) top() *Node { + if n := p.oe.top(); n != nil { + return n + } + return p.doc +} + +// Stop tags for use in popUntil. These come from section 12.2.4.2. +var ( + defaultScopeStopTags = map[string][]a.Atom{ + "": {a.Applet, a.Caption, a.Html, a.Table, a.Td, a.Th, a.Marquee, a.Object, a.Template}, + "math": {a.AnnotationXml, a.Mi, a.Mn, a.Mo, a.Ms, a.Mtext}, + "svg": {a.Desc, a.ForeignObject, a.Title}, + } +) + +type scope int + +const ( + defaultScope scope = iota + listItemScope + buttonScope + tableScope + tableRowScope + tableBodyScope + selectScope +) + +// popUntil pops the stack of open elements at the highest element whose tag +// is in matchTags, provided there is no higher element in the scope's stop +// tags (as defined in section 12.2.4.2). It returns whether or not there was +// such an element. If there was not, popUntil leaves the stack unchanged. +// +// For example, the set of stop tags for table scope is: "html", "table". If +// the stack was: +// ["html", "body", "font", "table", "b", "i", "u"] +// then popUntil(tableScope, "font") would return false, but +// popUntil(tableScope, "i") would return true and the stack would become: +// ["html", "body", "font", "table", "b"] +// +// If an element's tag is in both the stop tags and matchTags, then the stack +// will be popped and the function returns true (provided, of course, there was +// no higher element in the stack that was also in the stop tags). For example, +// popUntil(tableScope, "table") returns true and leaves: +// ["html", "body", "font"] +func (p *parser) popUntil(s scope, matchTags ...a.Atom) bool { + if i := p.indexOfElementInScope(s, matchTags...); i != -1 { + p.oe = p.oe[:i] + return true + } + return false +} + +// indexOfElementInScope returns the index in p.oe of the highest element whose +// tag is in matchTags that is in scope. If no matching element is in scope, it +// returns -1. +func (p *parser) indexOfElementInScope(s scope, matchTags ...a.Atom) int { + for i := len(p.oe) - 1; i >= 0; i-- { + tagAtom := p.oe[i].DataAtom + if p.oe[i].Namespace == "" { + for _, t := range matchTags { + if t == tagAtom { + return i + } + } + switch s { + case defaultScope: + // No-op. + case listItemScope: + if tagAtom == a.Ol || tagAtom == a.Ul { + return -1 + } + case buttonScope: + if tagAtom == a.Button { + return -1 + } + case tableScope: + if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template { + return -1 + } + case selectScope: + if tagAtom != a.Optgroup && tagAtom != a.Option { + return -1 + } + default: + panic("unreachable") + } + } + switch s { + case defaultScope, listItemScope, buttonScope: + for _, t := range defaultScopeStopTags[p.oe[i].Namespace] { + if t == tagAtom { + return -1 + } + } + } + } + return -1 +} + +// elementInScope is like popUntil, except that it doesn't modify the stack of +// open elements. +func (p *parser) elementInScope(s scope, matchTags ...a.Atom) bool { + return p.indexOfElementInScope(s, matchTags...) != -1 +} + +// clearStackToContext pops elements off the stack of open elements until a +// scope-defined element is found. +func (p *parser) clearStackToContext(s scope) { + for i := len(p.oe) - 1; i >= 0; i-- { + tagAtom := p.oe[i].DataAtom + switch s { + case tableScope: + if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template { + p.oe = p.oe[:i+1] + return + } + case tableRowScope: + if tagAtom == a.Html || tagAtom == a.Tr || tagAtom == a.Template { + p.oe = p.oe[:i+1] + return + } + case tableBodyScope: + if tagAtom == a.Html || tagAtom == a.Tbody || tagAtom == a.Tfoot || tagAtom == a.Thead || tagAtom == a.Template { + p.oe = p.oe[:i+1] + return + } + default: + panic("unreachable") + } + } +} + +// parseGenericRawTextElements implements the generic raw text element parsing +// algorithm defined in 12.2.6.2. +// https://html.spec.whatwg.org/multipage/parsing.html#parsing-elements-that-contain-only-text +// TODO: Since both RAWTEXT and RCDATA states are treated as tokenizer's part +// officially, need to make tokenizer consider both states. +func (p *parser) parseGenericRawTextElement() { + p.addElement() + p.originalIM = p.im + p.im = textIM +} + +// generateImpliedEndTags pops nodes off the stack of open elements as long as +// the top node has a tag name of dd, dt, li, optgroup, option, p, rb, rp, rt or rtc. +// If exceptions are specified, nodes with that name will not be popped off. +func (p *parser) generateImpliedEndTags(exceptions ...string) { + var i int +loop: + for i = len(p.oe) - 1; i >= 0; i-- { + n := p.oe[i] + if n.Type != ElementNode { + break + } + switch n.DataAtom { + case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc: + for _, except := range exceptions { + if n.Data == except { + break loop + } + } + continue + } + break + } + + p.oe = p.oe[:i+1] +} + +// addChild adds a child node n to the top element, and pushes n onto the stack +// of open elements if it is an element node. +func (p *parser) addChild(n *Node) { + if p.shouldFosterParent() { + p.fosterParent(n) + } else { + p.top().AppendChild(n) + } + + if n.Type == ElementNode { + p.oe = append(p.oe, n) + } +} + +// shouldFosterParent returns whether the next node to be added should be +// foster parented. +func (p *parser) shouldFosterParent() bool { + if p.fosterParenting { + switch p.top().DataAtom { + case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: + return true + } + } + return false +} + +// fosterParent adds a child node according to the foster parenting rules. +// Section 12.2.6.1, "foster parenting". +func (p *parser) fosterParent(n *Node) { + var table, parent, prev, template *Node + var i int + for i = len(p.oe) - 1; i >= 0; i-- { + if p.oe[i].DataAtom == a.Table { + table = p.oe[i] + break + } + } + + var j int + for j = len(p.oe) - 1; j >= 0; j-- { + if p.oe[j].DataAtom == a.Template { + template = p.oe[j] + break + } + } + + if template != nil && (table == nil || j > i) { + template.AppendChild(n) + return + } + + if table == nil { + // The foster parent is the html element. + parent = p.oe[0] + } else { + parent = table.Parent + } + if parent == nil { + parent = p.oe[i-1] + } + + if table != nil { + prev = table.PrevSibling + } else { + prev = parent.LastChild + } + if prev != nil && prev.Type == TextNode && n.Type == TextNode { + prev.Data += n.Data + return + } + + parent.InsertBefore(n, table) +} + +// addText adds text to the preceding node if it is a text node, or else it +// calls addChild with a new text node. +func (p *parser) addText(text string) { + if text == "" { + return + } + + if p.shouldFosterParent() { + p.fosterParent(&Node{ + Type: TextNode, + Data: text, + }) + return + } + + t := p.top() + if n := t.LastChild; n != nil && n.Type == TextNode { + n.Data += text + return + } + p.addChild(&Node{ + Type: TextNode, + Data: text, + }) +} + +// addElement adds a child element based on the current token. +func (p *parser) addElement() { + p.addChild(&Node{ + Type: ElementNode, + DataAtom: p.tok.DataAtom, + Data: p.tok.Data, + Attr: p.tok.Attr, + }) +} + +// Section 12.2.4.3. +func (p *parser) addFormattingElement() { + tagAtom, attr := p.tok.DataAtom, p.tok.Attr + p.addElement() + + // Implement the Noah's Ark clause, but with three per family instead of two. + identicalElements := 0 +findIdenticalElements: + for i := len(p.afe) - 1; i >= 0; i-- { + n := p.afe[i] + if n.Type == scopeMarkerNode { + break + } + if n.Type != ElementNode { + continue + } + if n.Namespace != "" { + continue + } + if n.DataAtom != tagAtom { + continue + } + if len(n.Attr) != len(attr) { + continue + } + compareAttributes: + for _, t0 := range n.Attr { + for _, t1 := range attr { + if t0.Key == t1.Key && t0.Namespace == t1.Namespace && t0.Val == t1.Val { + // Found a match for this attribute, continue with the next attribute. + continue compareAttributes + } + } + // If we get here, there is no attribute that matches a. + // Therefore the element is not identical to the new one. + continue findIdenticalElements + } + + identicalElements++ + if identicalElements >= 3 { + p.afe.remove(n) + } + } + + p.afe = append(p.afe, p.top()) +} + +// Section 12.2.4.3. +func (p *parser) clearActiveFormattingElements() { + for { + if n := p.afe.pop(); len(p.afe) == 0 || n.Type == scopeMarkerNode { + return + } + } +} + +// Section 12.2.4.3. +func (p *parser) reconstructActiveFormattingElements() { + n := p.afe.top() + if n == nil { + return + } + if n.Type == scopeMarkerNode || p.oe.index(n) != -1 { + return + } + i := len(p.afe) - 1 + for n.Type != scopeMarkerNode && p.oe.index(n) == -1 { + if i == 0 { + i = -1 + break + } + i-- + n = p.afe[i] + } + for { + i++ + clone := p.afe[i].clone() + p.addChild(clone) + p.afe[i] = clone + if i == len(p.afe)-1 { + break + } + } +} + +// Section 12.2.5. +func (p *parser) acknowledgeSelfClosingTag() { + p.hasSelfClosingToken = false +} + +// An insertion mode (section 12.2.4.1) is the state transition function from +// a particular state in the HTML5 parser's state machine. It updates the +// parser's fields depending on parser.tok (where ErrorToken means EOF). +// It returns whether the token was consumed. +type insertionMode func(*parser) bool + +// setOriginalIM sets the insertion mode to return to after completing a text or +// inTableText insertion mode. +// Section 12.2.4.1, "using the rules for". +func (p *parser) setOriginalIM() { + if p.originalIM != nil { + panic("html: bad parser state: originalIM was set twice") + } + p.originalIM = p.im +} + +// Section 12.2.4.1, "reset the insertion mode". +func (p *parser) resetInsertionMode() { + for i := len(p.oe) - 1; i >= 0; i-- { + n := p.oe[i] + last := i == 0 + if last && p.context != nil { + n = p.context + } + + switch n.DataAtom { + case a.Select: + if !last { + for ancestor, first := n, p.oe[0]; ancestor != first; { + ancestor = p.oe[p.oe.index(ancestor)-1] + switch ancestor.DataAtom { + case a.Template: + p.im = inSelectIM + return + case a.Table: + p.im = inSelectInTableIM + return + } + } + } + p.im = inSelectIM + case a.Td, a.Th: + // TODO: remove this divergence from the HTML5 spec. + // + // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668 + p.im = inCellIM + case a.Tr: + p.im = inRowIM + case a.Tbody, a.Thead, a.Tfoot: + p.im = inTableBodyIM + case a.Caption: + p.im = inCaptionIM + case a.Colgroup: + p.im = inColumnGroupIM + case a.Table: + p.im = inTableIM + case a.Template: + // TODO: remove this divergence from the HTML5 spec. + if n.Namespace != "" { + continue + } + p.im = p.templateStack.top() + case a.Head: + // TODO: remove this divergence from the HTML5 spec. + // + // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668 + p.im = inHeadIM + case a.Body: + p.im = inBodyIM + case a.Frameset: + p.im = inFramesetIM + case a.Html: + if p.head == nil { + p.im = beforeHeadIM + } else { + p.im = afterHeadIM + } + default: + if last { + p.im = inBodyIM + return + } + continue + } + return + } +} + +const whitespace = " \t\r\n\f" + +// Section 12.2.6.4.1. +func initialIM(p *parser) bool { + switch p.tok.Type { + case TextToken: + p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace) + if len(p.tok.Data) == 0 { + // It was all whitespace, so ignore it. + return true + } + case CommentToken: + p.doc.AppendChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + return true + case DoctypeToken: + n, quirks := parseDoctype(p.tok.Data) + p.doc.AppendChild(n) + p.quirks = quirks + p.im = beforeHTMLIM + return true + } + p.quirks = true + p.im = beforeHTMLIM + return false +} + +// Section 12.2.6.4.2. +func beforeHTMLIM(p *parser) bool { + switch p.tok.Type { + case DoctypeToken: + // Ignore the token. + return true + case TextToken: + p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace) + if len(p.tok.Data) == 0 { + // It was all whitespace, so ignore it. + return true + } + case StartTagToken: + if p.tok.DataAtom == a.Html { + p.addElement() + p.im = beforeHeadIM + return true + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Head, a.Body, a.Html, a.Br: + p.parseImpliedToken(StartTagToken, a.Html, a.Html.String()) + return false + default: + // Ignore the token. + return true + } + case CommentToken: + p.doc.AppendChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + return true + } + p.parseImpliedToken(StartTagToken, a.Html, a.Html.String()) + return false +} + +// Section 12.2.6.4.3. +func beforeHeadIM(p *parser) bool { + switch p.tok.Type { + case TextToken: + p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace) + if len(p.tok.Data) == 0 { + // It was all whitespace, so ignore it. + return true + } + case StartTagToken: + switch p.tok.DataAtom { + case a.Head: + p.addElement() + p.head = p.top() + p.im = inHeadIM + return true + case a.Html: + return inBodyIM(p) + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Head, a.Body, a.Html, a.Br: + p.parseImpliedToken(StartTagToken, a.Head, a.Head.String()) + return false + default: + // Ignore the token. + return true + } + case CommentToken: + p.addChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + return true + case DoctypeToken: + // Ignore the token. + return true + } + + p.parseImpliedToken(StartTagToken, a.Head, a.Head.String()) + return false +} + +// Section 12.2.6.4.4. +func inHeadIM(p *parser) bool { + switch p.tok.Type { + case TextToken: + s := strings.TrimLeft(p.tok.Data, whitespace) + if len(s) < len(p.tok.Data) { + // Add the initial whitespace to the current node. + p.addText(p.tok.Data[:len(p.tok.Data)-len(s)]) + if s == "" { + return true + } + p.tok.Data = s + } + case StartTagToken: + switch p.tok.DataAtom { + case a.Html: + return inBodyIM(p) + case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta: + p.addElement() + p.oe.pop() + p.acknowledgeSelfClosingTag() + return true + case a.Noscript: + if p.scripting { + p.parseGenericRawTextElement() + return true + } + p.addElement() + p.im = inHeadNoscriptIM + // Don't let the tokenizer go into raw text mode when scripting is disabled. + p.tokenizer.NextIsNotRawText() + return true + case a.Script, a.Title: + p.addElement() + p.setOriginalIM() + p.im = textIM + return true + case a.Noframes, a.Style: + p.parseGenericRawTextElement() + return true + case a.Head: + // Ignore the token. + return true + case a.Template: + p.addElement() + p.afe = append(p.afe, &scopeMarker) + p.framesetOK = false + p.im = inTemplateIM + p.templateStack = append(p.templateStack, inTemplateIM) + return true + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Head: + p.oe.pop() + p.im = afterHeadIM + return true + case a.Body, a.Html, a.Br: + p.parseImpliedToken(EndTagToken, a.Head, a.Head.String()) + return false + case a.Template: + if !p.oe.contains(a.Template) { + return true + } + // TODO: remove this divergence from the HTML5 spec. + // + // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668 + p.generateImpliedEndTags() + for i := len(p.oe) - 1; i >= 0; i-- { + if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template { + p.oe = p.oe[:i] + break + } + } + p.clearActiveFormattingElements() + p.templateStack.pop() + p.resetInsertionMode() + return true + default: + // Ignore the token. + return true + } + case CommentToken: + p.addChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + return true + case DoctypeToken: + // Ignore the token. + return true + } + + p.parseImpliedToken(EndTagToken, a.Head, a.Head.String()) + return false +} + +// 12.2.6.4.5. +func inHeadNoscriptIM(p *parser) bool { + switch p.tok.Type { + case DoctypeToken: + // Ignore the token. + return true + case StartTagToken: + switch p.tok.DataAtom { + case a.Html: + return inBodyIM(p) + case a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Style: + return inHeadIM(p) + case a.Head: + // Ignore the token. + return true + case a.Noscript: + // Don't let the tokenizer go into raw text mode even when a + // tag is in "in head noscript" insertion mode. + p.tokenizer.NextIsNotRawText() + // Ignore the token. + return true + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Noscript, a.Br: + default: + // Ignore the token. + return true + } + case TextToken: + s := strings.TrimLeft(p.tok.Data, whitespace) + if len(s) == 0 { + // It was all whitespace. + return inHeadIM(p) + } + case CommentToken: + return inHeadIM(p) + } + p.oe.pop() + if p.top().DataAtom != a.Head { + panic("html: the new current node will be a head element.") + } + p.im = inHeadIM + if p.tok.DataAtom == a.Noscript { + return true + } + return false +} + +// Section 12.2.6.4.6. +func afterHeadIM(p *parser) bool { + switch p.tok.Type { + case TextToken: + s := strings.TrimLeft(p.tok.Data, whitespace) + if len(s) < len(p.tok.Data) { + // Add the initial whitespace to the current node. + p.addText(p.tok.Data[:len(p.tok.Data)-len(s)]) + if s == "" { + return true + } + p.tok.Data = s + } + case StartTagToken: + switch p.tok.DataAtom { + case a.Html: + return inBodyIM(p) + case a.Body: + p.addElement() + p.framesetOK = false + p.im = inBodyIM + return true + case a.Frameset: + p.addElement() + p.im = inFramesetIM + return true + case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title: + p.oe = append(p.oe, p.head) + defer p.oe.remove(p.head) + return inHeadIM(p) + case a.Head: + // Ignore the token. + return true + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Body, a.Html, a.Br: + // Drop down to creating an implied tag. + case a.Template: + return inHeadIM(p) + default: + // Ignore the token. + return true + } + case CommentToken: + p.addChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + return true + case DoctypeToken: + // Ignore the token. + return true + } + + p.parseImpliedToken(StartTagToken, a.Body, a.Body.String()) + p.framesetOK = true + return false +} + +// copyAttributes copies attributes of src not found on dst to dst. +func copyAttributes(dst *Node, src Token) { + if len(src.Attr) == 0 { + return + } + attr := map[string]string{} + for _, t := range dst.Attr { + attr[t.Key] = t.Val + } + for _, t := range src.Attr { + if _, ok := attr[t.Key]; !ok { + dst.Attr = append(dst.Attr, t) + attr[t.Key] = t.Val + } + } +} + +// Section 12.2.6.4.7. +func inBodyIM(p *parser) bool { + switch p.tok.Type { + case TextToken: + d := p.tok.Data + switch n := p.oe.top(); n.DataAtom { + case a.Pre, a.Listing: + if n.FirstChild == nil { + // Ignore a newline at the start of a block. + if d != "" && d[0] == '\r' { + d = d[1:] + } + if d != "" && d[0] == '\n' { + d = d[1:] + } + } + } + d = strings.Replace(d, "\x00", "", -1) + if d == "" { + return true + } + p.reconstructActiveFormattingElements() + p.addText(d) + if p.framesetOK && strings.TrimLeft(d, whitespace) != "" { + // There were non-whitespace characters inserted. + p.framesetOK = false + } + case StartTagToken: + switch p.tok.DataAtom { + case a.Html: + if p.oe.contains(a.Template) { + return true + } + copyAttributes(p.oe[0], p.tok) + case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title: + return inHeadIM(p) + case a.Body: + if p.oe.contains(a.Template) { + return true + } + if len(p.oe) >= 2 { + body := p.oe[1] + if body.Type == ElementNode && body.DataAtom == a.Body { + p.framesetOK = false + copyAttributes(body, p.tok) + } + } + case a.Frameset: + if !p.framesetOK || len(p.oe) < 2 || p.oe[1].DataAtom != a.Body { + // Ignore the token. + return true + } + body := p.oe[1] + if body.Parent != nil { + body.Parent.RemoveChild(body) + } + p.oe = p.oe[:1] + p.addElement() + p.im = inFramesetIM + return true + case a.Address, a.Article, a.Aside, a.Blockquote, a.Center, a.Details, a.Dialog, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Main, a.Menu, a.Nav, a.Ol, a.P, a.Section, a.Summary, a.Ul: + p.popUntil(buttonScope, a.P) + p.addElement() + case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6: + p.popUntil(buttonScope, a.P) + switch n := p.top(); n.DataAtom { + case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6: + p.oe.pop() + } + p.addElement() + case a.Pre, a.Listing: + p.popUntil(buttonScope, a.P) + p.addElement() + // The newline, if any, will be dealt with by the TextToken case. + p.framesetOK = false + case a.Form: + if p.form != nil && !p.oe.contains(a.Template) { + // Ignore the token + return true + } + p.popUntil(buttonScope, a.P) + p.addElement() + if !p.oe.contains(a.Template) { + p.form = p.top() + } + case a.Li: + p.framesetOK = false + for i := len(p.oe) - 1; i >= 0; i-- { + node := p.oe[i] + switch node.DataAtom { + case a.Li: + p.oe = p.oe[:i] + case a.Address, a.Div, a.P: + continue + default: + if !isSpecialElement(node) { + continue + } + } + break + } + p.popUntil(buttonScope, a.P) + p.addElement() + case a.Dd, a.Dt: + p.framesetOK = false + for i := len(p.oe) - 1; i >= 0; i-- { + node := p.oe[i] + switch node.DataAtom { + case a.Dd, a.Dt: + p.oe = p.oe[:i] + case a.Address, a.Div, a.P: + continue + default: + if !isSpecialElement(node) { + continue + } + } + break + } + p.popUntil(buttonScope, a.P) + p.addElement() + case a.Plaintext: + p.popUntil(buttonScope, a.P) + p.addElement() + case a.Button: + p.popUntil(defaultScope, a.Button) + p.reconstructActiveFormattingElements() + p.addElement() + p.framesetOK = false + case a.A: + for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- { + if n := p.afe[i]; n.Type == ElementNode && n.DataAtom == a.A { + p.inBodyEndTagFormatting(a.A, "a") + p.oe.remove(n) + p.afe.remove(n) + break + } + } + p.reconstructActiveFormattingElements() + p.addFormattingElement() + case a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U: + p.reconstructActiveFormattingElements() + p.addFormattingElement() + case a.Nobr: + p.reconstructActiveFormattingElements() + if p.elementInScope(defaultScope, a.Nobr) { + p.inBodyEndTagFormatting(a.Nobr, "nobr") + p.reconstructActiveFormattingElements() + } + p.addFormattingElement() + case a.Applet, a.Marquee, a.Object: + p.reconstructActiveFormattingElements() + p.addElement() + p.afe = append(p.afe, &scopeMarker) + p.framesetOK = false + case a.Table: + if !p.quirks { + p.popUntil(buttonScope, a.P) + } + p.addElement() + p.framesetOK = false + p.im = inTableIM + return true + case a.Area, a.Br, a.Embed, a.Img, a.Input, a.Keygen, a.Wbr: + p.reconstructActiveFormattingElements() + p.addElement() + p.oe.pop() + p.acknowledgeSelfClosingTag() + if p.tok.DataAtom == a.Input { + for _, t := range p.tok.Attr { + if t.Key == "type" { + if strings.ToLower(t.Val) == "hidden" { + // Skip setting framesetOK = false + return true + } + } + } + } + p.framesetOK = false + case a.Param, a.Source, a.Track: + p.addElement() + p.oe.pop() + p.acknowledgeSelfClosingTag() + case a.Hr: + p.popUntil(buttonScope, a.P) + p.addElement() + p.oe.pop() + p.acknowledgeSelfClosingTag() + p.framesetOK = false + case a.Image: + p.tok.DataAtom = a.Img + p.tok.Data = a.Img.String() + return false + case a.Textarea: + p.addElement() + p.setOriginalIM() + p.framesetOK = false + p.im = textIM + case a.Xmp: + p.popUntil(buttonScope, a.P) + p.reconstructActiveFormattingElements() + p.framesetOK = false + p.parseGenericRawTextElement() + case a.Iframe: + p.framesetOK = false + p.parseGenericRawTextElement() + case a.Noembed: + p.parseGenericRawTextElement() + case a.Noscript: + if p.scripting { + p.parseGenericRawTextElement() + return true + } + p.reconstructActiveFormattingElements() + p.addElement() + // Don't let the tokenizer go into raw text mode when scripting is disabled. + p.tokenizer.NextIsNotRawText() + case a.Select: + p.reconstructActiveFormattingElements() + p.addElement() + p.framesetOK = false + p.im = inSelectIM + return true + case a.Optgroup, a.Option: + if p.top().DataAtom == a.Option { + p.oe.pop() + } + p.reconstructActiveFormattingElements() + p.addElement() + case a.Rb, a.Rtc: + if p.elementInScope(defaultScope, a.Ruby) { + p.generateImpliedEndTags() + } + p.addElement() + case a.Rp, a.Rt: + if p.elementInScope(defaultScope, a.Ruby) { + p.generateImpliedEndTags("rtc") + } + p.addElement() + case a.Math, a.Svg: + p.reconstructActiveFormattingElements() + if p.tok.DataAtom == a.Math { + adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments) + } else { + adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments) + } + adjustForeignAttributes(p.tok.Attr) + p.addElement() + p.top().Namespace = p.tok.Data + if p.hasSelfClosingToken { + p.oe.pop() + p.acknowledgeSelfClosingTag() + } + return true + case a.Caption, a.Col, a.Colgroup, a.Frame, a.Head, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: + // Ignore the token. + default: + p.reconstructActiveFormattingElements() + p.addElement() + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Body: + if p.elementInScope(defaultScope, a.Body) { + p.im = afterBodyIM + } + case a.Html: + if p.elementInScope(defaultScope, a.Body) { + p.parseImpliedToken(EndTagToken, a.Body, a.Body.String()) + return false + } + return true + case a.Address, a.Article, a.Aside, a.Blockquote, a.Button, a.Center, a.Details, a.Dialog, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Listing, a.Main, a.Menu, a.Nav, a.Ol, a.Pre, a.Section, a.Summary, a.Ul: + p.popUntil(defaultScope, p.tok.DataAtom) + case a.Form: + if p.oe.contains(a.Template) { + i := p.indexOfElementInScope(defaultScope, a.Form) + if i == -1 { + // Ignore the token. + return true + } + p.generateImpliedEndTags() + if p.oe[i].DataAtom != a.Form { + // Ignore the token. + return true + } + p.popUntil(defaultScope, a.Form) + } else { + node := p.form + p.form = nil + i := p.indexOfElementInScope(defaultScope, a.Form) + if node == nil || i == -1 || p.oe[i] != node { + // Ignore the token. + return true + } + p.generateImpliedEndTags() + p.oe.remove(node) + } + case a.P: + if !p.elementInScope(buttonScope, a.P) { + p.parseImpliedToken(StartTagToken, a.P, a.P.String()) + } + p.popUntil(buttonScope, a.P) + case a.Li: + p.popUntil(listItemScope, a.Li) + case a.Dd, a.Dt: + p.popUntil(defaultScope, p.tok.DataAtom) + case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6: + p.popUntil(defaultScope, a.H1, a.H2, a.H3, a.H4, a.H5, a.H6) + case a.A, a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.Nobr, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U: + p.inBodyEndTagFormatting(p.tok.DataAtom, p.tok.Data) + case a.Applet, a.Marquee, a.Object: + if p.popUntil(defaultScope, p.tok.DataAtom) { + p.clearActiveFormattingElements() + } + case a.Br: + p.tok.Type = StartTagToken + return false + case a.Template: + return inHeadIM(p) + default: + p.inBodyEndTagOther(p.tok.DataAtom, p.tok.Data) + } + case CommentToken: + p.addChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + case ErrorToken: + // TODO: remove this divergence from the HTML5 spec. + if len(p.templateStack) > 0 { + p.im = inTemplateIM + return false + } + for _, e := range p.oe { + switch e.DataAtom { + case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc, a.Tbody, a.Td, a.Tfoot, a.Th, + a.Thead, a.Tr, a.Body, a.Html: + default: + return true + } + } + } + + return true +} + +func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom, tagName string) { + // This is the "adoption agency" algorithm, described at + // https://html.spec.whatwg.org/multipage/syntax.html#adoptionAgency + + // TODO: this is a fairly literal line-by-line translation of that algorithm. + // Once the code successfully parses the comprehensive test suite, we should + // refactor this code to be more idiomatic. + + // Steps 1-2 + if current := p.oe.top(); current.Data == tagName && p.afe.index(current) == -1 { + p.oe.pop() + return + } + + // Steps 3-5. The outer loop. + for i := 0; i < 8; i++ { + // Step 6. Find the formatting element. + var formattingElement *Node + for j := len(p.afe) - 1; j >= 0; j-- { + if p.afe[j].Type == scopeMarkerNode { + break + } + if p.afe[j].DataAtom == tagAtom { + formattingElement = p.afe[j] + break + } + } + if formattingElement == nil { + p.inBodyEndTagOther(tagAtom, tagName) + return + } + + // Step 7. Ignore the tag if formatting element is not in the stack of open elements. + feIndex := p.oe.index(formattingElement) + if feIndex == -1 { + p.afe.remove(formattingElement) + return + } + // Step 8. Ignore the tag if formatting element is not in the scope. + if !p.elementInScope(defaultScope, tagAtom) { + // Ignore the tag. + return + } + + // Step 9. This step is omitted because it's just a parse error but no need to return. + + // Steps 10-11. Find the furthest block. + var furthestBlock *Node + for _, e := range p.oe[feIndex:] { + if isSpecialElement(e) { + furthestBlock = e + break + } + } + if furthestBlock == nil { + e := p.oe.pop() + for e != formattingElement { + e = p.oe.pop() + } + p.afe.remove(e) + return + } + + // Steps 12-13. Find the common ancestor and bookmark node. + commonAncestor := p.oe[feIndex-1] + bookmark := p.afe.index(formattingElement) + + // Step 14. The inner loop. Find the lastNode to reparent. + lastNode := furthestBlock + node := furthestBlock + x := p.oe.index(node) + // Step 14.1. + j := 0 + for { + // Step 14.2. + j++ + // Step. 14.3. + x-- + node = p.oe[x] + // Step 14.4. Go to the next step if node is formatting element. + if node == formattingElement { + break + } + // Step 14.5. Remove node from the list of active formatting elements if + // inner loop counter is greater than three and node is in the list of + // active formatting elements. + if ni := p.afe.index(node); j > 3 && ni > -1 { + p.afe.remove(node) + // If any element of the list of active formatting elements is removed, + // we need to take care whether bookmark should be decremented or not. + // This is because the value of bookmark may exceed the size of the + // list by removing elements from the list. + if ni <= bookmark { + bookmark-- + } + continue + } + // Step 14.6. Continue the next inner loop if node is not in the list of + // active formatting elements. + if p.afe.index(node) == -1 { + p.oe.remove(node) + continue + } + // Step 14.7. + clone := node.clone() + p.afe[p.afe.index(node)] = clone + p.oe[p.oe.index(node)] = clone + node = clone + // Step 14.8. + if lastNode == furthestBlock { + bookmark = p.afe.index(node) + 1 + } + // Step 14.9. + if lastNode.Parent != nil { + lastNode.Parent.RemoveChild(lastNode) + } + node.AppendChild(lastNode) + // Step 14.10. + lastNode = node + } + + // Step 15. Reparent lastNode to the common ancestor, + // or for misnested table nodes, to the foster parent. + if lastNode.Parent != nil { + lastNode.Parent.RemoveChild(lastNode) + } + switch commonAncestor.DataAtom { + case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: + p.fosterParent(lastNode) + default: + commonAncestor.AppendChild(lastNode) + } + + // Steps 16-18. Reparent nodes from the furthest block's children + // to a clone of the formatting element. + clone := formattingElement.clone() + reparentChildren(clone, furthestBlock) + furthestBlock.AppendChild(clone) + + // Step 19. Fix up the list of active formatting elements. + if oldLoc := p.afe.index(formattingElement); oldLoc != -1 && oldLoc < bookmark { + // Move the bookmark with the rest of the list. + bookmark-- + } + p.afe.remove(formattingElement) + p.afe.insert(bookmark, clone) + + // Step 20. Fix up the stack of open elements. + p.oe.remove(formattingElement) + p.oe.insert(p.oe.index(furthestBlock)+1, clone) + } +} + +// inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM. +// "Any other end tag" handling from 12.2.6.5 The rules for parsing tokens in foreign content +// https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inforeign +func (p *parser) inBodyEndTagOther(tagAtom a.Atom, tagName string) { + for i := len(p.oe) - 1; i >= 0; i-- { + // Two element nodes have the same tag if they have the same Data (a + // string-typed field). As an optimization, for common HTML tags, each + // Data string is assigned a unique, non-zero DataAtom (a uint32-typed + // field), since integer comparison is faster than string comparison. + // Uncommon (custom) tags get a zero DataAtom. + // + // The if condition here is equivalent to (p.oe[i].Data == tagName). + if (p.oe[i].DataAtom == tagAtom) && + ((tagAtom != 0) || (p.oe[i].Data == tagName)) { + p.oe = p.oe[:i] + break + } + if isSpecialElement(p.oe[i]) { + break + } + } +} + +// Section 12.2.6.4.8. +func textIM(p *parser) bool { + switch p.tok.Type { + case ErrorToken: + p.oe.pop() + case TextToken: + d := p.tok.Data + if n := p.oe.top(); n.DataAtom == a.Textarea && n.FirstChild == nil { + // Ignore a newline at the start of a block. + if d != "" && d[0] == '\r' { + d = d[1:] + } + if d != "" && d[0] == '\n' { + d = d[1:] + } + } + if d == "" { + return true + } + p.addText(d) + return true + case EndTagToken: + p.oe.pop() + } + p.im = p.originalIM + p.originalIM = nil + return p.tok.Type == EndTagToken +} + +// Section 12.2.6.4.9. +func inTableIM(p *parser) bool { + switch p.tok.Type { + case TextToken: + p.tok.Data = strings.Replace(p.tok.Data, "\x00", "", -1) + switch p.oe.top().DataAtom { + case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: + if strings.Trim(p.tok.Data, whitespace) == "" { + p.addText(p.tok.Data) + return true + } + } + case StartTagToken: + switch p.tok.DataAtom { + case a.Caption: + p.clearStackToContext(tableScope) + p.afe = append(p.afe, &scopeMarker) + p.addElement() + p.im = inCaptionIM + return true + case a.Colgroup: + p.clearStackToContext(tableScope) + p.addElement() + p.im = inColumnGroupIM + return true + case a.Col: + p.parseImpliedToken(StartTagToken, a.Colgroup, a.Colgroup.String()) + return false + case a.Tbody, a.Tfoot, a.Thead: + p.clearStackToContext(tableScope) + p.addElement() + p.im = inTableBodyIM + return true + case a.Td, a.Th, a.Tr: + p.parseImpliedToken(StartTagToken, a.Tbody, a.Tbody.String()) + return false + case a.Table: + if p.popUntil(tableScope, a.Table) { + p.resetInsertionMode() + return false + } + // Ignore the token. + return true + case a.Style, a.Script, a.Template: + return inHeadIM(p) + case a.Input: + for _, t := range p.tok.Attr { + if t.Key == "type" && strings.ToLower(t.Val) == "hidden" { + p.addElement() + p.oe.pop() + return true + } + } + // Otherwise drop down to the default action. + case a.Form: + if p.oe.contains(a.Template) || p.form != nil { + // Ignore the token. + return true + } + p.addElement() + p.form = p.oe.pop() + case a.Select: + p.reconstructActiveFormattingElements() + switch p.top().DataAtom { + case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: + p.fosterParenting = true + } + p.addElement() + p.fosterParenting = false + p.framesetOK = false + p.im = inSelectInTableIM + return true + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Table: + if p.popUntil(tableScope, a.Table) { + p.resetInsertionMode() + return true + } + // Ignore the token. + return true + case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: + // Ignore the token. + return true + case a.Template: + return inHeadIM(p) + } + case CommentToken: + p.addChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + return true + case DoctypeToken: + // Ignore the token. + return true + case ErrorToken: + return inBodyIM(p) + } + + p.fosterParenting = true + defer func() { p.fosterParenting = false }() + + return inBodyIM(p) +} + +// Section 12.2.6.4.11. +func inCaptionIM(p *parser) bool { + switch p.tok.Type { + case StartTagToken: + switch p.tok.DataAtom { + case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Thead, a.Tr: + if !p.popUntil(tableScope, a.Caption) { + // Ignore the token. + return true + } + p.clearActiveFormattingElements() + p.im = inTableIM + return false + case a.Select: + p.reconstructActiveFormattingElements() + p.addElement() + p.framesetOK = false + p.im = inSelectInTableIM + return true + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Caption: + if p.popUntil(tableScope, a.Caption) { + p.clearActiveFormattingElements() + p.im = inTableIM + } + return true + case a.Table: + if !p.popUntil(tableScope, a.Caption) { + // Ignore the token. + return true + } + p.clearActiveFormattingElements() + p.im = inTableIM + return false + case a.Body, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: + // Ignore the token. + return true + } + } + return inBodyIM(p) +} + +// Section 12.2.6.4.12. +func inColumnGroupIM(p *parser) bool { + switch p.tok.Type { + case TextToken: + s := strings.TrimLeft(p.tok.Data, whitespace) + if len(s) < len(p.tok.Data) { + // Add the initial whitespace to the current node. + p.addText(p.tok.Data[:len(p.tok.Data)-len(s)]) + if s == "" { + return true + } + p.tok.Data = s + } + case CommentToken: + p.addChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + return true + case DoctypeToken: + // Ignore the token. + return true + case StartTagToken: + switch p.tok.DataAtom { + case a.Html: + return inBodyIM(p) + case a.Col: + p.addElement() + p.oe.pop() + p.acknowledgeSelfClosingTag() + return true + case a.Template: + return inHeadIM(p) + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Colgroup: + if p.oe.top().DataAtom == a.Colgroup { + p.oe.pop() + p.im = inTableIM + } + return true + case a.Col: + // Ignore the token. + return true + case a.Template: + return inHeadIM(p) + } + case ErrorToken: + return inBodyIM(p) + } + if p.oe.top().DataAtom != a.Colgroup { + return true + } + p.oe.pop() + p.im = inTableIM + return false +} + +// Section 12.2.6.4.13. +func inTableBodyIM(p *parser) bool { + switch p.tok.Type { + case StartTagToken: + switch p.tok.DataAtom { + case a.Tr: + p.clearStackToContext(tableBodyScope) + p.addElement() + p.im = inRowIM + return true + case a.Td, a.Th: + p.parseImpliedToken(StartTagToken, a.Tr, a.Tr.String()) + return false + case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead: + if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) { + p.im = inTableIM + return false + } + // Ignore the token. + return true + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Tbody, a.Tfoot, a.Thead: + if p.elementInScope(tableScope, p.tok.DataAtom) { + p.clearStackToContext(tableBodyScope) + p.oe.pop() + p.im = inTableIM + } + return true + case a.Table: + if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) { + p.im = inTableIM + return false + } + // Ignore the token. + return true + case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th, a.Tr: + // Ignore the token. + return true + } + case CommentToken: + p.addChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + return true + } + + return inTableIM(p) +} + +// Section 12.2.6.4.14. +func inRowIM(p *parser) bool { + switch p.tok.Type { + case StartTagToken: + switch p.tok.DataAtom { + case a.Td, a.Th: + p.clearStackToContext(tableRowScope) + p.addElement() + p.afe = append(p.afe, &scopeMarker) + p.im = inCellIM + return true + case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead, a.Tr: + if p.popUntil(tableScope, a.Tr) { + p.im = inTableBodyIM + return false + } + // Ignore the token. + return true + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Tr: + if p.popUntil(tableScope, a.Tr) { + p.im = inTableBodyIM + return true + } + // Ignore the token. + return true + case a.Table: + if p.popUntil(tableScope, a.Tr) { + p.im = inTableBodyIM + return false + } + // Ignore the token. + return true + case a.Tbody, a.Tfoot, a.Thead: + if p.elementInScope(tableScope, p.tok.DataAtom) { + p.parseImpliedToken(EndTagToken, a.Tr, a.Tr.String()) + return false + } + // Ignore the token. + return true + case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th: + // Ignore the token. + return true + } + } + + return inTableIM(p) +} + +// Section 12.2.6.4.15. +func inCellIM(p *parser) bool { + switch p.tok.Type { + case StartTagToken: + switch p.tok.DataAtom { + case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: + if p.popUntil(tableScope, a.Td, a.Th) { + // Close the cell and reprocess. + p.clearActiveFormattingElements() + p.im = inRowIM + return false + } + // Ignore the token. + return true + case a.Select: + p.reconstructActiveFormattingElements() + p.addElement() + p.framesetOK = false + p.im = inSelectInTableIM + return true + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Td, a.Th: + if !p.popUntil(tableScope, p.tok.DataAtom) { + // Ignore the token. + return true + } + p.clearActiveFormattingElements() + p.im = inRowIM + return true + case a.Body, a.Caption, a.Col, a.Colgroup, a.Html: + // Ignore the token. + return true + case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: + if !p.elementInScope(tableScope, p.tok.DataAtom) { + // Ignore the token. + return true + } + // Close the cell and reprocess. + if p.popUntil(tableScope, a.Td, a.Th) { + p.clearActiveFormattingElements() + } + p.im = inRowIM + return false + } + } + return inBodyIM(p) +} + +// Section 12.2.6.4.16. +func inSelectIM(p *parser) bool { + switch p.tok.Type { + case TextToken: + p.addText(strings.Replace(p.tok.Data, "\x00", "", -1)) + case StartTagToken: + switch p.tok.DataAtom { + case a.Html: + return inBodyIM(p) + case a.Option: + if p.top().DataAtom == a.Option { + p.oe.pop() + } + p.addElement() + case a.Optgroup: + if p.top().DataAtom == a.Option { + p.oe.pop() + } + if p.top().DataAtom == a.Optgroup { + p.oe.pop() + } + p.addElement() + case a.Select: + if !p.popUntil(selectScope, a.Select) { + // Ignore the token. + return true + } + p.resetInsertionMode() + case a.Input, a.Keygen, a.Textarea: + if p.elementInScope(selectScope, a.Select) { + p.parseImpliedToken(EndTagToken, a.Select, a.Select.String()) + return false + } + // In order to properly ignore , we need to change the tokenizer mode. + p.tokenizer.NextIsNotRawText() + // Ignore the token. + return true + case a.Script, a.Template: + return inHeadIM(p) + case a.Iframe, a.Noembed, a.Noframes, a.Noscript, a.Plaintext, a.Style, a.Title, a.Xmp: + // Don't let the tokenizer go into raw text mode when there are raw tags + // to be ignored. These tags should be ignored from the tokenizer + // properly. + p.tokenizer.NextIsNotRawText() + // Ignore the token. + return true + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Option: + if p.top().DataAtom == a.Option { + p.oe.pop() + } + case a.Optgroup: + i := len(p.oe) - 1 + if p.oe[i].DataAtom == a.Option { + i-- + } + if p.oe[i].DataAtom == a.Optgroup { + p.oe = p.oe[:i] + } + case a.Select: + if !p.popUntil(selectScope, a.Select) { + // Ignore the token. + return true + } + p.resetInsertionMode() + case a.Template: + return inHeadIM(p) + } + case CommentToken: + p.addChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + case DoctypeToken: + // Ignore the token. + return true + case ErrorToken: + return inBodyIM(p) + } + + return true +} + +// Section 12.2.6.4.17. +func inSelectInTableIM(p *parser) bool { + switch p.tok.Type { + case StartTagToken, EndTagToken: + switch p.tok.DataAtom { + case a.Caption, a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr, a.Td, a.Th: + if p.tok.Type == EndTagToken && !p.elementInScope(tableScope, p.tok.DataAtom) { + // Ignore the token. + return true + } + // This is like p.popUntil(selectScope, a.Select), but it also + // matches , not just . Matching the MathML + // tag is arguably incorrect (conceptually), but it mimics what + // Chromium does. + for i := len(p.oe) - 1; i >= 0; i-- { + if n := p.oe[i]; n.DataAtom == a.Select { + p.oe = p.oe[:i] + break + } + } + p.resetInsertionMode() + return false + } + } + return inSelectIM(p) +} + +// Section 12.2.6.4.18. +func inTemplateIM(p *parser) bool { + switch p.tok.Type { + case TextToken, CommentToken, DoctypeToken: + return inBodyIM(p) + case StartTagToken: + switch p.tok.DataAtom { + case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title: + return inHeadIM(p) + case a.Caption, a.Colgroup, a.Tbody, a.Tfoot, a.Thead: + p.templateStack.pop() + p.templateStack = append(p.templateStack, inTableIM) + p.im = inTableIM + return false + case a.Col: + p.templateStack.pop() + p.templateStack = append(p.templateStack, inColumnGroupIM) + p.im = inColumnGroupIM + return false + case a.Tr: + p.templateStack.pop() + p.templateStack = append(p.templateStack, inTableBodyIM) + p.im = inTableBodyIM + return false + case a.Td, a.Th: + p.templateStack.pop() + p.templateStack = append(p.templateStack, inRowIM) + p.im = inRowIM + return false + default: + p.templateStack.pop() + p.templateStack = append(p.templateStack, inBodyIM) + p.im = inBodyIM + return false + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Template: + return inHeadIM(p) + default: + // Ignore the token. + return true + } + case ErrorToken: + if !p.oe.contains(a.Template) { + // Ignore the token. + return true + } + // TODO: remove this divergence from the HTML5 spec. + // + // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668 + p.generateImpliedEndTags() + for i := len(p.oe) - 1; i >= 0; i-- { + if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template { + p.oe = p.oe[:i] + break + } + } + p.clearActiveFormattingElements() + p.templateStack.pop() + p.resetInsertionMode() + return false + } + return false +} + +// Section 12.2.6.4.19. +func afterBodyIM(p *parser) bool { + switch p.tok.Type { + case ErrorToken: + // Stop parsing. + return true + case TextToken: + s := strings.TrimLeft(p.tok.Data, whitespace) + if len(s) == 0 { + // It was all whitespace. + return inBodyIM(p) + } + case StartTagToken: + if p.tok.DataAtom == a.Html { + return inBodyIM(p) + } + case EndTagToken: + if p.tok.DataAtom == a.Html { + if !p.fragment { + p.im = afterAfterBodyIM + } + return true + } + case CommentToken: + // The comment is attached to the element. + if len(p.oe) < 1 || p.oe[0].DataAtom != a.Html { + panic("html: bad parser state: element not found, in the after-body insertion mode") + } + p.oe[0].AppendChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + return true + } + p.im = inBodyIM + return false +} + +// Section 12.2.6.4.20. +func inFramesetIM(p *parser) bool { + switch p.tok.Type { + case CommentToken: + p.addChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + case TextToken: + // Ignore all text but whitespace. + s := strings.Map(func(c rune) rune { + switch c { + case ' ', '\t', '\n', '\f', '\r': + return c + } + return -1 + }, p.tok.Data) + if s != "" { + p.addText(s) + } + case StartTagToken: + switch p.tok.DataAtom { + case a.Html: + return inBodyIM(p) + case a.Frameset: + p.addElement() + case a.Frame: + p.addElement() + p.oe.pop() + p.acknowledgeSelfClosingTag() + case a.Noframes: + return inHeadIM(p) + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Frameset: + if p.oe.top().DataAtom != a.Html { + p.oe.pop() + if p.oe.top().DataAtom != a.Frameset { + p.im = afterFramesetIM + return true + } + } + } + default: + // Ignore the token. + } + return true +} + +// Section 12.2.6.4.21. +func afterFramesetIM(p *parser) bool { + switch p.tok.Type { + case CommentToken: + p.addChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + case TextToken: + // Ignore all text but whitespace. + s := strings.Map(func(c rune) rune { + switch c { + case ' ', '\t', '\n', '\f', '\r': + return c + } + return -1 + }, p.tok.Data) + if s != "" { + p.addText(s) + } + case StartTagToken: + switch p.tok.DataAtom { + case a.Html: + return inBodyIM(p) + case a.Noframes: + return inHeadIM(p) + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Html: + p.im = afterAfterFramesetIM + return true + } + default: + // Ignore the token. + } + return true +} + +// Section 12.2.6.4.22. +func afterAfterBodyIM(p *parser) bool { + switch p.tok.Type { + case ErrorToken: + // Stop parsing. + return true + case TextToken: + s := strings.TrimLeft(p.tok.Data, whitespace) + if len(s) == 0 { + // It was all whitespace. + return inBodyIM(p) + } + case StartTagToken: + if p.tok.DataAtom == a.Html { + return inBodyIM(p) + } + case CommentToken: + p.doc.AppendChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + return true + case DoctypeToken: + return inBodyIM(p) + } + p.im = inBodyIM + return false +} + +// Section 12.2.6.4.23. +func afterAfterFramesetIM(p *parser) bool { + switch p.tok.Type { + case CommentToken: + p.doc.AppendChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + case TextToken: + // Ignore all text but whitespace. + s := strings.Map(func(c rune) rune { + switch c { + case ' ', '\t', '\n', '\f', '\r': + return c + } + return -1 + }, p.tok.Data) + if s != "" { + p.tok.Data = s + return inBodyIM(p) + } + case StartTagToken: + switch p.tok.DataAtom { + case a.Html: + return inBodyIM(p) + case a.Noframes: + return inHeadIM(p) + } + case DoctypeToken: + return inBodyIM(p) + default: + // Ignore the token. + } + return true +} + +const whitespaceOrNUL = whitespace + "\x00" + +// Section 12.2.6.5 +func parseForeignContent(p *parser) bool { + switch p.tok.Type { + case TextToken: + if p.framesetOK { + p.framesetOK = strings.TrimLeft(p.tok.Data, whitespaceOrNUL) == "" + } + p.tok.Data = strings.Replace(p.tok.Data, "\x00", "\ufffd", -1) + p.addText(p.tok.Data) + case CommentToken: + p.addChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + case StartTagToken: + if !p.fragment { + b := breakout[p.tok.Data] + if p.tok.DataAtom == a.Font { + loop: + for _, attr := range p.tok.Attr { + switch attr.Key { + case "color", "face", "size": + b = true + break loop + } + } + } + if b { + for i := len(p.oe) - 1; i >= 0; i-- { + n := p.oe[i] + if n.Namespace == "" || htmlIntegrationPoint(n) || mathMLTextIntegrationPoint(n) { + p.oe = p.oe[:i+1] + break + } + } + return false + } + } + current := p.adjustedCurrentNode() + switch current.Namespace { + case "math": + adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments) + case "svg": + // Adjust SVG tag names. The tokenizer lower-cases tag names, but + // SVG wants e.g. "foreignObject" with a capital second "O". + if x := svgTagNameAdjustments[p.tok.Data]; x != "" { + p.tok.DataAtom = a.Lookup([]byte(x)) + p.tok.Data = x + } + adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments) + default: + panic("html: bad parser state: unexpected namespace") + } + adjustForeignAttributes(p.tok.Attr) + namespace := current.Namespace + p.addElement() + p.top().Namespace = namespace + if namespace != "" { + // Don't let the tokenizer go into raw text mode in foreign content + // (e.g. in an SVG tag). + p.tokenizer.NextIsNotRawText() + } + if p.hasSelfClosingToken { + p.oe.pop() + p.acknowledgeSelfClosingTag() + } + case EndTagToken: + for i := len(p.oe) - 1; i >= 0; i-- { + if p.oe[i].Namespace == "" { + return p.im(p) + } + if strings.EqualFold(p.oe[i].Data, p.tok.Data) { + p.oe = p.oe[:i] + break + } + } + return true + default: + // Ignore the token. + } + return true +} + +// Section 12.2.4.2. +func (p *parser) adjustedCurrentNode() *Node { + if len(p.oe) == 1 && p.fragment && p.context != nil { + return p.context + } + return p.oe.top() +} + +// Section 12.2.6. +func (p *parser) inForeignContent() bool { + if len(p.oe) == 0 { + return false + } + n := p.adjustedCurrentNode() + if n.Namespace == "" { + return false + } + if mathMLTextIntegrationPoint(n) { + if p.tok.Type == StartTagToken && p.tok.DataAtom != a.Mglyph && p.tok.DataAtom != a.Malignmark { + return false + } + if p.tok.Type == TextToken { + return false + } + } + if n.Namespace == "math" && n.DataAtom == a.AnnotationXml && p.tok.Type == StartTagToken && p.tok.DataAtom == a.Svg { + return false + } + if htmlIntegrationPoint(n) && (p.tok.Type == StartTagToken || p.tok.Type == TextToken) { + return false + } + if p.tok.Type == ErrorToken { + return false + } + return true +} + +// parseImpliedToken parses a token as though it had appeared in the parser's +// input. +func (p *parser) parseImpliedToken(t TokenType, dataAtom a.Atom, data string) { + realToken, selfClosing := p.tok, p.hasSelfClosingToken + p.tok = Token{ + Type: t, + DataAtom: dataAtom, + Data: data, + } + p.hasSelfClosingToken = false + p.parseCurrentToken() + p.tok, p.hasSelfClosingToken = realToken, selfClosing +} + +// parseCurrentToken runs the current token through the parsing routines +// until it is consumed. +func (p *parser) parseCurrentToken() { + if p.tok.Type == SelfClosingTagToken { + p.hasSelfClosingToken = true + p.tok.Type = StartTagToken + } + + consumed := false + for !consumed { + if p.inForeignContent() { + consumed = parseForeignContent(p) + } else { + consumed = p.im(p) + } + } + + if p.hasSelfClosingToken { + // This is a parse error, but ignore it. + p.hasSelfClosingToken = false + } +} + +func (p *parser) parse() error { + // Iterate until EOF. Any other error will cause an early return. + var err error + for err != io.EOF { + // CDATA sections are allowed only in foreign content. + n := p.oe.top() + p.tokenizer.AllowCDATA(n != nil && n.Namespace != "") + // Read and parse the next token. + p.tokenizer.Next() + p.tok = p.tokenizer.Token() + if p.tok.Type == ErrorToken { + err = p.tokenizer.Err() + if err != nil && err != io.EOF { + return err + } + } + p.parseCurrentToken() + } + return nil +} + +// Parse returns the parse tree for the HTML from the given Reader. +// +// It implements the HTML5 parsing algorithm +// (https://html.spec.whatwg.org/multipage/syntax.html#tree-construction), +// which is very complicated. The resultant tree can contain implicitly created +// nodes that have no explicit listed in r's data, and nodes' parents can +// differ from the nesting implied by a naive processing of start and end +// s. Conversely, explicit s in r's data can be silently dropped, +// with no corresponding node in the resulting tree. +// +// The input is assumed to be UTF-8 encoded. +func Parse(r io.Reader) (*Node, error) { + return ParseWithOptions(r) +} + +// ParseFragment parses a fragment of HTML and returns the nodes that were +// found. If the fragment is the InnerHTML for an existing element, pass that +// element in context. +// +// It has the same intricacies as Parse. +func ParseFragment(r io.Reader, context *Node) ([]*Node, error) { + return ParseFragmentWithOptions(r, context) +} + +// ParseOption configures a parser. +type ParseOption func(p *parser) + +// ParseOptionEnableScripting configures the scripting flag. +// https://html.spec.whatwg.org/multipage/webappapis.html#enabling-and-disabling-scripting +// +// By default, scripting is enabled. +func ParseOptionEnableScripting(enable bool) ParseOption { + return func(p *parser) { + p.scripting = enable + } +} + +// ParseWithOptions is like Parse, with options. +func ParseWithOptions(r io.Reader, opts ...ParseOption) (*Node, error) { + p := &parser{ + tokenizer: NewTokenizer(r), + doc: &Node{ + Type: DocumentNode, + }, + scripting: true, + framesetOK: true, + im: initialIM, + } + + for _, f := range opts { + f(p) + } + + if err := p.parse(); err != nil { + return nil, err + } + return p.doc, nil +} + +// ParseFragmentWithOptions is like ParseFragment, with options. +func ParseFragmentWithOptions(r io.Reader, context *Node, opts ...ParseOption) ([]*Node, error) { + contextTag := "" + if context != nil { + if context.Type != ElementNode { + return nil, errors.New("html: ParseFragment of non-element Node") + } + // The next check isn't just context.DataAtom.String() == context.Data because + // it is valid to pass an element whose tag isn't a known atom. For example, + // DataAtom == 0 and Data = "tagfromthefuture" is perfectly consistent. + if context.DataAtom != a.Lookup([]byte(context.Data)) { + return nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data) + } + contextTag = context.DataAtom.String() + } + p := &parser{ + doc: &Node{ + Type: DocumentNode, + }, + scripting: true, + fragment: true, + context: context, + } + if context != nil && context.Namespace != "" { + p.tokenizer = NewTokenizer(r) + } else { + p.tokenizer = NewTokenizerFragment(r, contextTag) + } + + for _, f := range opts { + f(p) + } + + root := &Node{ + Type: ElementNode, + DataAtom: a.Html, + Data: a.Html.String(), + } + p.doc.AppendChild(root) + p.oe = nodeStack{root} + if context != nil && context.DataAtom == a.Template { + p.templateStack = append(p.templateStack, inTemplateIM) + } + p.resetInsertionMode() + + for n := context; n != nil; n = n.Parent { + if n.Type == ElementNode && n.DataAtom == a.Form { + p.form = n + break + } + } + + if err := p.parse(); err != nil { + return nil, err + } + + parent := p.doc + if context != nil { + parent = root + } + + var result []*Node + for c := parent.FirstChild; c != nil; { + next := c.NextSibling + parent.RemoveChild(c) + result = append(result, c) + c = next + } + return result, nil +} diff --git a/vendor/golang.org/x/net/html/render.go b/vendor/golang.org/x/net/html/render.go new file mode 100644 index 0000000..b46d81c --- /dev/null +++ b/vendor/golang.org/x/net/html/render.go @@ -0,0 +1,273 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package html + +import ( + "bufio" + "errors" + "fmt" + "io" + "strings" +) + +type writer interface { + io.Writer + io.ByteWriter + WriteString(string) (int, error) +} + +// Render renders the parse tree n to the given writer. +// +// Rendering is done on a 'best effort' basis: calling Parse on the output of +// Render will always result in something similar to the original tree, but it +// is not necessarily an exact clone unless the original tree was 'well-formed'. +// 'Well-formed' is not easily specified; the HTML5 specification is +// complicated. +// +// Calling Parse on arbitrary input typically results in a 'well-formed' parse +// tree. However, it is possible for Parse to yield a 'badly-formed' parse tree. +// For example, in a 'well-formed' parse tree, no element is a child of +// another element: parsing "" results in two sibling elements. +// Similarly, in a 'well-formed' parse tree, no element is a child of a +// element: parsing "" results in a with two sibling +// children; the is reparented to the 's parent. However, calling +// Parse on "" does not return an error, but the result has an +// element with an child, and is therefore not 'well-formed'. +// +// Programmatically constructed trees are typically also 'well-formed', but it +// is possible to construct a tree that looks innocuous but, when rendered and +// re-parsed, results in a different tree. A simple example is that a solitary +// text node would become a tree containing , and elements. +// Another example is that the programmatic equivalent of "abc" +// becomes "abc". +func Render(w io.Writer, n *Node) error { + if x, ok := w.(writer); ok { + return render(x, n) + } + buf := bufio.NewWriter(w) + if err := render(buf, n); err != nil { + return err + } + return buf.Flush() +} + +// plaintextAbort is returned from render1 when a element +// has been rendered. No more end tags should be rendered after that. +var plaintextAbort = errors.New("html: internal error (plaintext abort)") + +func render(w writer, n *Node) error { + err := render1(w, n) + if err == plaintextAbort { + err = nil + } + return err +} + +func render1(w writer, n *Node) error { + // Render non-element nodes; these are the easy cases. + switch n.Type { + case ErrorNode: + return errors.New("html: cannot render an ErrorNode node") + case TextNode: + return escape(w, n.Data) + case DocumentNode: + for c := n.FirstChild; c != nil; c = c.NextSibling { + if err := render1(w, c); err != nil { + return err + } + } + return nil + case ElementNode: + // No-op. + case CommentNode: + if _, err := w.WriteString(""); err != nil { + return err + } + return nil + case DoctypeNode: + if _, err := w.WriteString("') + case RawNode: + _, err := w.WriteString(n.Data) + return err + default: + return errors.New("html: unknown node type") + } + + // Render the opening tag. + if err := w.WriteByte('<'); err != nil { + return err + } + if _, err := w.WriteString(n.Data); err != nil { + return err + } + for _, a := range n.Attr { + if err := w.WriteByte(' '); err != nil { + return err + } + if a.Namespace != "" { + if _, err := w.WriteString(a.Namespace); err != nil { + return err + } + if err := w.WriteByte(':'); err != nil { + return err + } + } + if _, err := w.WriteString(a.Key); err != nil { + return err + } + if _, err := w.WriteString(`="`); err != nil { + return err + } + if err := escape(w, a.Val); err != nil { + return err + } + if err := w.WriteByte('"'); err != nil { + return err + } + } + if voidElements[n.Data] { + if n.FirstChild != nil { + return fmt.Errorf("html: void element <%s> has child nodes", n.Data) + } + _, err := w.WriteString("/>") + return err + } + if err := w.WriteByte('>'); err != nil { + return err + } + + // Add initial newline where there is danger of a newline beging ignored. + if c := n.FirstChild; c != nil && c.Type == TextNode && strings.HasPrefix(c.Data, "\n") { + switch n.Data { + case "pre", "listing", "textarea": + if err := w.WriteByte('\n'); err != nil { + return err + } + } + } + + // Render any child nodes. + switch n.Data { + case "iframe", "noembed", "noframes", "noscript", "plaintext", "script", "style", "xmp": + for c := n.FirstChild; c != nil; c = c.NextSibling { + if c.Type == TextNode { + if _, err := w.WriteString(c.Data); err != nil { + return err + } + } else { + if err := render1(w, c); err != nil { + return err + } + } + } + if n.Data == "plaintext" { + // Don't render anything else. must be the + // last element in the file, with no closing tag. + return plaintextAbort + } + default: + for c := n.FirstChild; c != nil; c = c.NextSibling { + if err := render1(w, c); err != nil { + return err + } + } + } + + // Render the closing tag. + if _, err := w.WriteString(""); err != nil { + return err + } + if _, err := w.WriteString(n.Data); err != nil { + return err + } + return w.WriteByte('>') +} + +// writeQuoted writes s to w surrounded by quotes. Normally it will use double +// quotes, but if s contains a double quote, it will use single quotes. +// It is used for writing the identifiers in a doctype declaration. +// In valid HTML, they can't contain both types of quotes. +func writeQuoted(w writer, s string) error { + var q byte = '"' + if strings.Contains(s, `"`) { + q = '\'' + } + if err := w.WriteByte(q); err != nil { + return err + } + if _, err := w.WriteString(s); err != nil { + return err + } + if err := w.WriteByte(q); err != nil { + return err + } + return nil +} + +// Section 12.1.2, "Elements", gives this list of void elements. Void elements +// are those that can't have any contents. +var voidElements = map[string]bool{ + "area": true, + "base": true, + "br": true, + "col": true, + "embed": true, + "hr": true, + "img": true, + "input": true, + "keygen": true, // "keygen" has been removed from the spec, but are kept here for backwards compatibility. + "link": true, + "meta": true, + "param": true, + "source": true, + "track": true, + "wbr": true, +} diff --git a/vendor/golang.org/x/net/html/token.go b/vendor/golang.org/x/net/html/token.go new file mode 100644 index 0000000..877709f --- /dev/null +++ b/vendor/golang.org/x/net/html/token.go @@ -0,0 +1,1224 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package html + +import ( + "bytes" + "errors" + "io" + "strconv" + "strings" + + "golang.org/x/net/html/atom" +) + +// A TokenType is the type of a Token. +type TokenType uint32 + +const ( + // ErrorToken means that an error occurred during tokenization. + ErrorToken TokenType = iota + // TextToken means a text node. + TextToken + // A StartTagToken looks like . + StartTagToken + // An EndTagToken looks like . + EndTagToken + // A SelfClosingTagToken tag looks like . + SelfClosingTagToken + // A CommentToken looks like . + CommentToken + // A DoctypeToken looks like + DoctypeToken +) + +// ErrBufferExceeded means that the buffering limit was exceeded. +var ErrBufferExceeded = errors.New("max buffer exceeded") + +// String returns a string representation of the TokenType. +func (t TokenType) String() string { + switch t { + case ErrorToken: + return "Error" + case TextToken: + return "Text" + case StartTagToken: + return "StartTag" + case EndTagToken: + return "EndTag" + case SelfClosingTagToken: + return "SelfClosingTag" + case CommentToken: + return "Comment" + case DoctypeToken: + return "Doctype" + } + return "Invalid(" + strconv.Itoa(int(t)) + ")" +} + +// An Attribute is an attribute namespace-key-value triple. Namespace is +// non-empty for foreign attributes like xlink, Key is alphabetic (and hence +// does not contain escapable characters like '&', '<' or '>'), and Val is +// unescaped (it looks like "a" + case EndTagToken: + return "" + t.tagString() + ">" + case SelfClosingTagToken: + return "<" + t.tagString() + "/>" + case CommentToken: + return "" + case DoctypeToken: + return "" + } + return "Invalid(" + strconv.Itoa(int(t.Type)) + ")" +} + +// span is a range of bytes in a Tokenizer's buffer. The start is inclusive, +// the end is exclusive. +type span struct { + start, end int +} + +// A Tokenizer returns a stream of HTML Tokens. +type Tokenizer struct { + // r is the source of the HTML text. + r io.Reader + // tt is the TokenType of the current token. + tt TokenType + // err is the first error encountered during tokenization. It is possible + // for tt != Error && err != nil to hold: this means that Next returned a + // valid token but the subsequent Next call will return an error token. + // For example, if the HTML text input was just "plain", then the first + // Next call would set z.err to io.EOF but return a TextToken, and all + // subsequent Next calls would return an ErrorToken. + // err is never reset. Once it becomes non-nil, it stays non-nil. + err error + // readErr is the error returned by the io.Reader r. It is separate from + // err because it is valid for an io.Reader to return (n int, err1 error) + // such that n > 0 && err1 != nil, and callers should always process the + // n > 0 bytes before considering the error err1. + readErr error + // buf[raw.start:raw.end] holds the raw bytes of the current token. + // buf[raw.end:] is buffered input that will yield future tokens. + raw span + buf []byte + // maxBuf limits the data buffered in buf. A value of 0 means unlimited. + maxBuf int + // buf[data.start:data.end] holds the raw bytes of the current token's data: + // a text token's text, a tag token's tag name, etc. + data span + // pendingAttr is the attribute key and value currently being tokenized. + // When complete, pendingAttr is pushed onto attr. nAttrReturned is + // incremented on each call to TagAttr. + pendingAttr [2]span + attr [][2]span + nAttrReturned int + // rawTag is the "script" in "" that closes the next token. If + // non-empty, the subsequent call to Next will return a raw or RCDATA text + // token: one that treats "" as text instead of an element. + // rawTag's contents are lower-cased. + rawTag string + // textIsRaw is whether the current text token's data is not escaped. + textIsRaw bool + // convertNUL is whether NUL bytes in the current token's data should + // be converted into \ufffd replacement characters. + convertNUL bool + // allowCDATA is whether CDATA sections are allowed in the current context. + allowCDATA bool +} + +// AllowCDATA sets whether or not the tokenizer recognizes as +// the text "foo". The default value is false, which means to recognize it as +// a bogus comment "" instead. +// +// Strictly speaking, an HTML5 compliant tokenizer should allow CDATA if and +// only if tokenizing foreign content, such as MathML and SVG. However, +// tracking foreign-contentness is difficult to do purely in the tokenizer, +// as opposed to the parser, due to HTML integration points: an element +// can contain a that is foreign-to-SVG but not foreign-to- +// HTML. For strict compliance with the HTML5 tokenization algorithm, it is the +// responsibility of the user of a tokenizer to call AllowCDATA as appropriate. +// In practice, if using the tokenizer without caring whether MathML or SVG +// CDATA is text or comments, such as tokenizing HTML to find all the anchor +// text, it is acceptable to ignore this responsibility. +func (z *Tokenizer) AllowCDATA(allowCDATA bool) { + z.allowCDATA = allowCDATA +} + +// NextIsNotRawText instructs the tokenizer that the next token should not be +// considered as 'raw text'. Some elements, such as script and title elements, +// normally require the next token after the opening tag to be 'raw text' that +// has no child elements. For example, tokenizing "acd" +// yields a start tag token for "", a text token for "acd", and +// an end tag token for "". There are no distinct start tag or end tag +// tokens for the "" and "". +// +// This tokenizer implementation will generally look for raw text at the right +// times. Strictly speaking, an HTML5 compliant tokenizer should not look for +// raw text if in foreign content: generally needs raw text, but a +// inside an does not. Another example is that a +// generally needs raw text, but a is not allowed as an immediate +// child of a ; in normal parsing, a implies , but +// one cannot close the implicit element when parsing a 's InnerHTML. +// Similarly to AllowCDATA, tracking the correct moment to override raw-text- +// ness is difficult to do purely in the tokenizer, as opposed to the parser. +// For strict compliance with the HTML5 tokenization algorithm, it is the +// responsibility of the user of a tokenizer to call NextIsNotRawText as +// appropriate. In practice, like AllowCDATA, it is acceptable to ignore this +// responsibility for basic usage. +// +// Note that this 'raw text' concept is different from the one offered by the +// Tokenizer.Raw method. +func (z *Tokenizer) NextIsNotRawText() { + z.rawTag = "" +} + +// Err returns the error associated with the most recent ErrorToken token. +// This is typically io.EOF, meaning the end of tokenization. +func (z *Tokenizer) Err() error { + if z.tt != ErrorToken { + return nil + } + return z.err +} + +// readByte returns the next byte from the input stream, doing a buffered read +// from z.r into z.buf if necessary. z.buf[z.raw.start:z.raw.end] remains a contiguous byte +// slice that holds all the bytes read so far for the current token. +// It sets z.err if the underlying reader returns an error. +// Pre-condition: z.err == nil. +func (z *Tokenizer) readByte() byte { + if z.raw.end >= len(z.buf) { + // Our buffer is exhausted and we have to read from z.r. Check if the + // previous read resulted in an error. + if z.readErr != nil { + z.err = z.readErr + return 0 + } + // We copy z.buf[z.raw.start:z.raw.end] to the beginning of z.buf. If the length + // z.raw.end - z.raw.start is more than half the capacity of z.buf, then we + // allocate a new buffer before the copy. + c := cap(z.buf) + d := z.raw.end - z.raw.start + var buf1 []byte + if 2*d > c { + buf1 = make([]byte, d, 2*c) + } else { + buf1 = z.buf[:d] + } + copy(buf1, z.buf[z.raw.start:z.raw.end]) + if x := z.raw.start; x != 0 { + // Adjust the data/attr spans to refer to the same contents after the copy. + z.data.start -= x + z.data.end -= x + z.pendingAttr[0].start -= x + z.pendingAttr[0].end -= x + z.pendingAttr[1].start -= x + z.pendingAttr[1].end -= x + for i := range z.attr { + z.attr[i][0].start -= x + z.attr[i][0].end -= x + z.attr[i][1].start -= x + z.attr[i][1].end -= x + } + } + z.raw.start, z.raw.end, z.buf = 0, d, buf1[:d] + // Now that we have copied the live bytes to the start of the buffer, + // we read from z.r into the remainder. + var n int + n, z.readErr = readAtLeastOneByte(z.r, buf1[d:cap(buf1)]) + if n == 0 { + z.err = z.readErr + return 0 + } + z.buf = buf1[:d+n] + } + x := z.buf[z.raw.end] + z.raw.end++ + if z.maxBuf > 0 && z.raw.end-z.raw.start >= z.maxBuf { + z.err = ErrBufferExceeded + return 0 + } + return x +} + +// Buffered returns a slice containing data buffered but not yet tokenized. +func (z *Tokenizer) Buffered() []byte { + return z.buf[z.raw.end:] +} + +// readAtLeastOneByte wraps an io.Reader so that reading cannot return (0, nil). +// It returns io.ErrNoProgress if the underlying r.Read method returns (0, nil) +// too many times in succession. +func readAtLeastOneByte(r io.Reader, b []byte) (int, error) { + for i := 0; i < 100; i++ { + if n, err := r.Read(b); n != 0 || err != nil { + return n, err + } + } + return 0, io.ErrNoProgress +} + +// skipWhiteSpace skips past any white space. +func (z *Tokenizer) skipWhiteSpace() { + if z.err != nil { + return + } + for { + c := z.readByte() + if z.err != nil { + return + } + switch c { + case ' ', '\n', '\r', '\t', '\f': + // No-op. + default: + z.raw.end-- + return + } + } +} + +// readRawOrRCDATA reads until the next "", where "foo" is z.rawTag and +// is typically something like "script" or "textarea". +func (z *Tokenizer) readRawOrRCDATA() { + if z.rawTag == "script" { + z.readScript() + z.textIsRaw = true + z.rawTag = "" + return + } +loop: + for { + c := z.readByte() + if z.err != nil { + break loop + } + if c != '<' { + continue loop + } + c = z.readByte() + if z.err != nil { + break loop + } + if c != '/' { + z.raw.end-- + continue loop + } + if z.readRawEndTag() || z.err != nil { + break loop + } + } + z.data.end = z.raw.end + // A textarea's or title's RCDATA can contain escaped entities. + z.textIsRaw = z.rawTag != "textarea" && z.rawTag != "title" + z.rawTag = "" +} + +// readRawEndTag attempts to read a tag like "", where "foo" is z.rawTag. +// If it succeeds, it backs up the input position to reconsume the tag and +// returns true. Otherwise it returns false. The opening "" has already been +// consumed. +func (z *Tokenizer) readRawEndTag() bool { + for i := 0; i < len(z.rawTag); i++ { + c := z.readByte() + if z.err != nil { + return false + } + if c != z.rawTag[i] && c != z.rawTag[i]-('a'-'A') { + z.raw.end-- + return false + } + } + c := z.readByte() + if z.err != nil { + return false + } + switch c { + case ' ', '\n', '\r', '\t', '\f', '/', '>': + // The 3 is 2 for the leading "" plus 1 for the trailing character c. + z.raw.end -= 3 + len(z.rawTag) + return true + } + z.raw.end-- + return false +} + +// readScript reads until the next tag, following the byzantine +// rules for escaping/hiding the closing tag. +func (z *Tokenizer) readScript() { + defer func() { + z.data.end = z.raw.end + }() + var c byte + +scriptData: + c = z.readByte() + if z.err != nil { + return + } + if c == '<' { + goto scriptDataLessThanSign + } + goto scriptData + +scriptDataLessThanSign: + c = z.readByte() + if z.err != nil { + return + } + switch c { + case '/': + goto scriptDataEndTagOpen + case '!': + goto scriptDataEscapeStart + } + z.raw.end-- + goto scriptData + +scriptDataEndTagOpen: + if z.readRawEndTag() || z.err != nil { + return + } + goto scriptData + +scriptDataEscapeStart: + c = z.readByte() + if z.err != nil { + return + } + if c == '-' { + goto scriptDataEscapeStartDash + } + z.raw.end-- + goto scriptData + +scriptDataEscapeStartDash: + c = z.readByte() + if z.err != nil { + return + } + if c == '-' { + goto scriptDataEscapedDashDash + } + z.raw.end-- + goto scriptData + +scriptDataEscaped: + c = z.readByte() + if z.err != nil { + return + } + switch c { + case '-': + goto scriptDataEscapedDash + case '<': + goto scriptDataEscapedLessThanSign + } + goto scriptDataEscaped + +scriptDataEscapedDash: + c = z.readByte() + if z.err != nil { + return + } + switch c { + case '-': + goto scriptDataEscapedDashDash + case '<': + goto scriptDataEscapedLessThanSign + } + goto scriptDataEscaped + +scriptDataEscapedDashDash: + c = z.readByte() + if z.err != nil { + return + } + switch c { + case '-': + goto scriptDataEscapedDashDash + case '<': + goto scriptDataEscapedLessThanSign + case '>': + goto scriptData + } + goto scriptDataEscaped + +scriptDataEscapedLessThanSign: + c = z.readByte() + if z.err != nil { + return + } + if c == '/' { + goto scriptDataEscapedEndTagOpen + } + if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' { + goto scriptDataDoubleEscapeStart + } + z.raw.end-- + goto scriptData + +scriptDataEscapedEndTagOpen: + if z.readRawEndTag() || z.err != nil { + return + } + goto scriptDataEscaped + +scriptDataDoubleEscapeStart: + z.raw.end-- + for i := 0; i < len("script"); i++ { + c = z.readByte() + if z.err != nil { + return + } + if c != "script"[i] && c != "SCRIPT"[i] { + z.raw.end-- + goto scriptDataEscaped + } + } + c = z.readByte() + if z.err != nil { + return + } + switch c { + case ' ', '\n', '\r', '\t', '\f', '/', '>': + goto scriptDataDoubleEscaped + } + z.raw.end-- + goto scriptDataEscaped + +scriptDataDoubleEscaped: + c = z.readByte() + if z.err != nil { + return + } + switch c { + case '-': + goto scriptDataDoubleEscapedDash + case '<': + goto scriptDataDoubleEscapedLessThanSign + } + goto scriptDataDoubleEscaped + +scriptDataDoubleEscapedDash: + c = z.readByte() + if z.err != nil { + return + } + switch c { + case '-': + goto scriptDataDoubleEscapedDashDash + case '<': + goto scriptDataDoubleEscapedLessThanSign + } + goto scriptDataDoubleEscaped + +scriptDataDoubleEscapedDashDash: + c = z.readByte() + if z.err != nil { + return + } + switch c { + case '-': + goto scriptDataDoubleEscapedDashDash + case '<': + goto scriptDataDoubleEscapedLessThanSign + case '>': + goto scriptData + } + goto scriptDataDoubleEscaped + +scriptDataDoubleEscapedLessThanSign: + c = z.readByte() + if z.err != nil { + return + } + if c == '/' { + goto scriptDataDoubleEscapeEnd + } + z.raw.end-- + goto scriptDataDoubleEscaped + +scriptDataDoubleEscapeEnd: + if z.readRawEndTag() { + z.raw.end += len("") + goto scriptDataEscaped + } + if z.err != nil { + return + } + goto scriptDataDoubleEscaped +} + +// readComment reads the next comment token starting with ". + z.data.end = z.data.start + } + }() + for dashCount := 2; ; { + c := z.readByte() + if z.err != nil { + // Ignore up to two dashes at EOF. + if dashCount > 2 { + dashCount = 2 + } + z.data.end = z.raw.end - dashCount + return + } + switch c { + case '-': + dashCount++ + continue + case '>': + if dashCount >= 2 { + z.data.end = z.raw.end - len("-->") + return + } + case '!': + if dashCount >= 2 { + c = z.readByte() + if z.err != nil { + z.data.end = z.raw.end + return + } + if c == '>' { + z.data.end = z.raw.end - len("--!>") + return + } + } + } + dashCount = 0 + } +} + +// readUntilCloseAngle reads until the next ">". +func (z *Tokenizer) readUntilCloseAngle() { + z.data.start = z.raw.end + for { + c := z.readByte() + if z.err != nil { + z.data.end = z.raw.end + return + } + if c == '>' { + z.data.end = z.raw.end - len(">") + return + } + } +} + +// readMarkupDeclaration reads the next token starting with "", a "", a "" or +// "': + if brackets >= 2 { + z.data.end = z.raw.end - len("]]>") + return true + } + brackets = 0 + default: + brackets = 0 + } + } +} + +// startTagIn returns whether the start tag in z.buf[z.data.start:z.data.end] +// case-insensitively matches any element of ss. +func (z *Tokenizer) startTagIn(ss ...string) bool { +loop: + for _, s := range ss { + if z.data.end-z.data.start != len(s) { + continue loop + } + for i := 0; i < len(s); i++ { + c := z.buf[z.data.start+i] + if 'A' <= c && c <= 'Z' { + c += 'a' - 'A' + } + if c != s[i] { + continue loop + } + } + return true + } + return false +} + +// readStartTag reads the next start tag token. The opening "". + if z.err == nil && z.buf[z.raw.end-2] == '/' { + return SelfClosingTagToken + } + return StartTagToken +} + +// readTag reads the next tag token and its attributes. If saveAttr, those +// attributes are saved in z.attr, otherwise z.attr is set to an empty slice. +// The opening "' { + break + } + z.raw.end-- + z.readTagAttrKey() + z.readTagAttrVal() + // Save pendingAttr if saveAttr and that attribute has a non-empty key. + if saveAttr && z.pendingAttr[0].start != z.pendingAttr[0].end { + z.attr = append(z.attr, z.pendingAttr) + } + if z.skipWhiteSpace(); z.err != nil { + break + } + } +} + +// readTagName sets z.data to the "div" in "". The reader (z.raw.end) +// is positioned such that the first byte of the tag name (the "d" in "': + z.raw.end-- + z.data.end = z.raw.end + return + } + } +} + +// readTagAttrKey sets z.pendingAttr[0] to the "k" in "". +// Precondition: z.err == nil. +func (z *Tokenizer) readTagAttrKey() { + z.pendingAttr[0].start = z.raw.end + for { + c := z.readByte() + if z.err != nil { + z.pendingAttr[0].end = z.raw.end + return + } + switch c { + case ' ', '\n', '\r', '\t', '\f', '/': + z.pendingAttr[0].end = z.raw.end - 1 + return + case '=', '>': + z.raw.end-- + z.pendingAttr[0].end = z.raw.end + return + } + } +} + +// readTagAttrVal sets z.pendingAttr[1] to the "v" in "". +func (z *Tokenizer) readTagAttrVal() { + z.pendingAttr[1].start = z.raw.end + z.pendingAttr[1].end = z.raw.end + if z.skipWhiteSpace(); z.err != nil { + return + } + c := z.readByte() + if z.err != nil { + return + } + if c != '=' { + z.raw.end-- + return + } + if z.skipWhiteSpace(); z.err != nil { + return + } + quote := z.readByte() + if z.err != nil { + return + } + switch quote { + case '>': + z.raw.end-- + return + + case '\'', '"': + z.pendingAttr[1].start = z.raw.end + for { + c := z.readByte() + if z.err != nil { + z.pendingAttr[1].end = z.raw.end + return + } + if c == quote { + z.pendingAttr[1].end = z.raw.end - 1 + return + } + } + + default: + z.pendingAttr[1].start = z.raw.end - 1 + for { + c := z.readByte() + if z.err != nil { + z.pendingAttr[1].end = z.raw.end + return + } + switch c { + case ' ', '\n', '\r', '\t', '\f': + z.pendingAttr[1].end = z.raw.end - 1 + return + case '>': + z.raw.end-- + z.pendingAttr[1].end = z.raw.end + return + } + } + } +} + +// Next scans the next token and returns its type. +func (z *Tokenizer) Next() TokenType { + z.raw.start = z.raw.end + z.data.start = z.raw.end + z.data.end = z.raw.end + if z.err != nil { + z.tt = ErrorToken + return z.tt + } + if z.rawTag != "" { + if z.rawTag == "plaintext" { + // Read everything up to EOF. + for z.err == nil { + z.readByte() + } + z.data.end = z.raw.end + z.textIsRaw = true + } else { + z.readRawOrRCDATA() + } + if z.data.end > z.data.start { + z.tt = TextToken + z.convertNUL = true + return z.tt + } + } + z.textIsRaw = false + z.convertNUL = false + +loop: + for { + c := z.readByte() + if z.err != nil { + break loop + } + if c != '<' { + continue loop + } + + // Check if the '<' we have just read is part of a tag, comment + // or doctype. If not, it's part of the accumulated text token. + c = z.readByte() + if z.err != nil { + break loop + } + var tokenType TokenType + switch { + case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z': + tokenType = StartTagToken + case c == '/': + tokenType = EndTagToken + case c == '!' || c == '?': + // We use CommentToken to mean any of "", + // "" and "". + tokenType = CommentToken + default: + // Reconsume the current character. + z.raw.end-- + continue + } + + // We have a non-text token, but we might have accumulated some text + // before that. If so, we return the text first, and return the non- + // text token on the subsequent call to Next. + if x := z.raw.end - len("' { + // ">" does not generate a token at all. Generate an empty comment + // to allow passthrough clients to pick up the data using Raw. + // Reset the tokenizer state and start again. + z.tt = CommentToken + return z.tt + } + if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' { + z.readTag(false) + if z.err != nil { + z.tt = ErrorToken + } else { + z.tt = EndTagToken + } + return z.tt + } + z.raw.end-- + z.readUntilCloseAngle() + z.tt = CommentToken + return z.tt + case CommentToken: + if c == '!' { + z.tt = z.readMarkupDeclaration() + return z.tt + } + z.raw.end-- + z.readUntilCloseAngle() + z.tt = CommentToken + return z.tt + } + } + if z.raw.start < z.raw.end { + z.data.end = z.raw.end + z.tt = TextToken + return z.tt + } + z.tt = ErrorToken + return z.tt +} + +// Raw returns the unmodified text of the current token. Calling Next, Token, +// Text, TagName or TagAttr may change the contents of the returned slice. +// +// The token stream's raw bytes partition the byte stream (up until an +// ErrorToken). There are no overlaps or gaps between two consecutive token's +// raw bytes. One implication is that the byte offset of the current token is +// the sum of the lengths of all previous tokens' raw bytes. +func (z *Tokenizer) Raw() []byte { + return z.buf[z.raw.start:z.raw.end] +} + +// convertNewlines converts "\r" and "\r\n" in s to "\n". +// The conversion happens in place, but the resulting slice may be shorter. +func convertNewlines(s []byte) []byte { + for i, c := range s { + if c != '\r' { + continue + } + + src := i + 1 + if src >= len(s) || s[src] != '\n' { + s[i] = '\n' + continue + } + + dst := i + for src < len(s) { + if s[src] == '\r' { + if src+1 < len(s) && s[src+1] == '\n' { + src++ + } + s[dst] = '\n' + } else { + s[dst] = s[src] + } + src++ + dst++ + } + return s[:dst] + } + return s +} + +var ( + nul = []byte("\x00") + replacement = []byte("\ufffd") +) + +// Text returns the unescaped text of a text, comment or doctype token. The +// contents of the returned slice may change on the next call to Next. +func (z *Tokenizer) Text() []byte { + switch z.tt { + case TextToken, CommentToken, DoctypeToken: + s := z.buf[z.data.start:z.data.end] + z.data.start = z.raw.end + z.data.end = z.raw.end + s = convertNewlines(s) + if (z.convertNUL || z.tt == CommentToken) && bytes.Contains(s, nul) { + s = bytes.Replace(s, nul, replacement, -1) + } + if !z.textIsRaw { + s = unescape(s, false) + } + return s + } + return nil +} + +// TagName returns the lower-cased name of a tag token (the `img` out of +// ``) and whether the tag has attributes. +// The contents of the returned slice may change on the next call to Next. +func (z *Tokenizer) TagName() (name []byte, hasAttr bool) { + if z.data.start < z.data.end { + switch z.tt { + case StartTagToken, EndTagToken, SelfClosingTagToken: + s := z.buf[z.data.start:z.data.end] + z.data.start = z.raw.end + z.data.end = z.raw.end + return lower(s), z.nAttrReturned < len(z.attr) + } + } + return nil, false +} + +// TagAttr returns the lower-cased key and unescaped value of the next unparsed +// attribute for the current tag token and whether there are more attributes. +// The contents of the returned slices may change on the next call to Next. +func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) { + if z.nAttrReturned < len(z.attr) { + switch z.tt { + case StartTagToken, SelfClosingTagToken: + x := z.attr[z.nAttrReturned] + z.nAttrReturned++ + key = z.buf[x[0].start:x[0].end] + val = z.buf[x[1].start:x[1].end] + return lower(key), unescape(convertNewlines(val), true), z.nAttrReturned < len(z.attr) + } + } + return nil, nil, false +} + +// Token returns the current Token. The result's Data and Attr values remain +// valid after subsequent Next calls. +func (z *Tokenizer) Token() Token { + t := Token{Type: z.tt} + switch z.tt { + case TextToken, CommentToken, DoctypeToken: + t.Data = string(z.Text()) + case StartTagToken, SelfClosingTagToken, EndTagToken: + name, moreAttr := z.TagName() + for moreAttr { + var key, val []byte + key, val, moreAttr = z.TagAttr() + t.Attr = append(t.Attr, Attribute{"", atom.String(key), string(val)}) + } + if a := atom.Lookup(name); a != 0 { + t.DataAtom, t.Data = a, a.String() + } else { + t.DataAtom, t.Data = 0, string(name) + } + } + return t +} + +// SetMaxBuf sets a limit on the amount of data buffered during tokenization. +// A value of 0 means unlimited. +func (z *Tokenizer) SetMaxBuf(n int) { + z.maxBuf = n +} + +// NewTokenizer returns a new HTML Tokenizer for the given Reader. +// The input is assumed to be UTF-8 encoded. +func NewTokenizer(r io.Reader) *Tokenizer { + return NewTokenizerFragment(r, "") +} + +// NewTokenizerFragment returns a new HTML Tokenizer for the given Reader, for +// tokenizing an existing element's InnerHTML fragment. contextTag is that +// element's tag, such as "div" or "iframe". +// +// For example, how the InnerHTML "a tag or a
block. + if d != "" && d[0] == '\r' { + d = d[1:] + } + if d != "" && d[0] == '\n' { + d = d[1:] + } + } + } + d = strings.Replace(d, "\x00", "", -1) + if d == "" { + return true + } + p.reconstructActiveFormattingElements() + p.addText(d) + if p.framesetOK && strings.TrimLeft(d, whitespace) != "" { + // There were non-whitespace characters inserted. + p.framesetOK = false + } + case StartTagToken: + switch p.tok.DataAtom { + case a.Html: + if p.oe.contains(a.Template) { + return true + } + copyAttributes(p.oe[0], p.tok) + case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title: + return inHeadIM(p) + case a.Body: + if p.oe.contains(a.Template) { + return true + } + if len(p.oe) >= 2 { + body := p.oe[1] + if body.Type == ElementNode && body.DataAtom == a.Body { + p.framesetOK = false + copyAttributes(body, p.tok) + } + } + case a.Frameset: + if !p.framesetOK || len(p.oe) < 2 || p.oe[1].DataAtom != a.Body { + // Ignore the token. + return true + } + body := p.oe[1] + if body.Parent != nil { + body.Parent.RemoveChild(body) + } + p.oe = p.oe[:1] + p.addElement() + p.im = inFramesetIM + return true + case a.Address, a.Article, a.Aside, a.Blockquote, a.Center, a.Details, a.Dialog, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Main, a.Menu, a.Nav, a.Ol, a.P, a.Section, a.Summary, a.Ul: + p.popUntil(buttonScope, a.P) + p.addElement() + case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6: + p.popUntil(buttonScope, a.P) + switch n := p.top(); n.DataAtom { + case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6: + p.oe.pop() + } + p.addElement() + case a.Pre, a.Listing: + p.popUntil(buttonScope, a.P) + p.addElement() + // The newline, if any, will be dealt with by the TextToken case. + p.framesetOK = false + case a.Form: + if p.form != nil && !p.oe.contains(a.Template) { + // Ignore the token + return true + } + p.popUntil(buttonScope, a.P) + p.addElement() + if !p.oe.contains(a.Template) { + p.form = p.top() + } + case a.Li: + p.framesetOK = false + for i := len(p.oe) - 1; i >= 0; i-- { + node := p.oe[i] + switch node.DataAtom { + case a.Li: + p.oe = p.oe[:i] + case a.Address, a.Div, a.P: + continue + default: + if !isSpecialElement(node) { + continue + } + } + break + } + p.popUntil(buttonScope, a.P) + p.addElement() + case a.Dd, a.Dt: + p.framesetOK = false + for i := len(p.oe) - 1; i >= 0; i-- { + node := p.oe[i] + switch node.DataAtom { + case a.Dd, a.Dt: + p.oe = p.oe[:i] + case a.Address, a.Div, a.P: + continue + default: + if !isSpecialElement(node) { + continue + } + } + break + } + p.popUntil(buttonScope, a.P) + p.addElement() + case a.Plaintext: + p.popUntil(buttonScope, a.P) + p.addElement() + case a.Button: + p.popUntil(defaultScope, a.Button) + p.reconstructActiveFormattingElements() + p.addElement() + p.framesetOK = false + case a.A: + for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- { + if n := p.afe[i]; n.Type == ElementNode && n.DataAtom == a.A { + p.inBodyEndTagFormatting(a.A, "a") + p.oe.remove(n) + p.afe.remove(n) + break + } + } + p.reconstructActiveFormattingElements() + p.addFormattingElement() + case a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U: + p.reconstructActiveFormattingElements() + p.addFormattingElement() + case a.Nobr: + p.reconstructActiveFormattingElements() + if p.elementInScope(defaultScope, a.Nobr) { + p.inBodyEndTagFormatting(a.Nobr, "nobr") + p.reconstructActiveFormattingElements() + } + p.addFormattingElement() + case a.Applet, a.Marquee, a.Object: + p.reconstructActiveFormattingElements() + p.addElement() + p.afe = append(p.afe, &scopeMarker) + p.framesetOK = false + case a.Table: + if !p.quirks { + p.popUntil(buttonScope, a.P) + } + p.addElement() + p.framesetOK = false + p.im = inTableIM + return true + case a.Area, a.Br, a.Embed, a.Img, a.Input, a.Keygen, a.Wbr: + p.reconstructActiveFormattingElements() + p.addElement() + p.oe.pop() + p.acknowledgeSelfClosingTag() + if p.tok.DataAtom == a.Input { + for _, t := range p.tok.Attr { + if t.Key == "type" { + if strings.ToLower(t.Val) == "hidden" { + // Skip setting framesetOK = false + return true + } + } + } + } + p.framesetOK = false + case a.Param, a.Source, a.Track: + p.addElement() + p.oe.pop() + p.acknowledgeSelfClosingTag() + case a.Hr: + p.popUntil(buttonScope, a.P) + p.addElement() + p.oe.pop() + p.acknowledgeSelfClosingTag() + p.framesetOK = false + case a.Image: + p.tok.DataAtom = a.Img + p.tok.Data = a.Img.String() + return false + case a.Textarea: + p.addElement() + p.setOriginalIM() + p.framesetOK = false + p.im = textIM + case a.Xmp: + p.popUntil(buttonScope, a.P) + p.reconstructActiveFormattingElements() + p.framesetOK = false + p.parseGenericRawTextElement() + case a.Iframe: + p.framesetOK = false + p.parseGenericRawTextElement() + case a.Noembed: + p.parseGenericRawTextElement() + case a.Noscript: + if p.scripting { + p.parseGenericRawTextElement() + return true + } + p.reconstructActiveFormattingElements() + p.addElement() + // Don't let the tokenizer go into raw text mode when scripting is disabled. + p.tokenizer.NextIsNotRawText() + case a.Select: + p.reconstructActiveFormattingElements() + p.addElement() + p.framesetOK = false + p.im = inSelectIM + return true + case a.Optgroup, a.Option: + if p.top().DataAtom == a.Option { + p.oe.pop() + } + p.reconstructActiveFormattingElements() + p.addElement() + case a.Rb, a.Rtc: + if p.elementInScope(defaultScope, a.Ruby) { + p.generateImpliedEndTags() + } + p.addElement() + case a.Rp, a.Rt: + if p.elementInScope(defaultScope, a.Ruby) { + p.generateImpliedEndTags("rtc") + } + p.addElement() + case a.Math, a.Svg: + p.reconstructActiveFormattingElements() + if p.tok.DataAtom == a.Math { + adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments) + } else { + adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments) + } + adjustForeignAttributes(p.tok.Attr) + p.addElement() + p.top().Namespace = p.tok.Data + if p.hasSelfClosingToken { + p.oe.pop() + p.acknowledgeSelfClosingTag() + } + return true + case a.Caption, a.Col, a.Colgroup, a.Frame, a.Head, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: + // Ignore the token. + default: + p.reconstructActiveFormattingElements() + p.addElement() + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Body: + if p.elementInScope(defaultScope, a.Body) { + p.im = afterBodyIM + } + case a.Html: + if p.elementInScope(defaultScope, a.Body) { + p.parseImpliedToken(EndTagToken, a.Body, a.Body.String()) + return false + } + return true + case a.Address, a.Article, a.Aside, a.Blockquote, a.Button, a.Center, a.Details, a.Dialog, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Listing, a.Main, a.Menu, a.Nav, a.Ol, a.Pre, a.Section, a.Summary, a.Ul: + p.popUntil(defaultScope, p.tok.DataAtom) + case a.Form: + if p.oe.contains(a.Template) { + i := p.indexOfElementInScope(defaultScope, a.Form) + if i == -1 { + // Ignore the token. + return true + } + p.generateImpliedEndTags() + if p.oe[i].DataAtom != a.Form { + // Ignore the token. + return true + } + p.popUntil(defaultScope, a.Form) + } else { + node := p.form + p.form = nil + i := p.indexOfElementInScope(defaultScope, a.Form) + if node == nil || i == -1 || p.oe[i] != node { + // Ignore the token. + return true + } + p.generateImpliedEndTags() + p.oe.remove(node) + } + case a.P: + if !p.elementInScope(buttonScope, a.P) { + p.parseImpliedToken(StartTagToken, a.P, a.P.String()) + } + p.popUntil(buttonScope, a.P) + case a.Li: + p.popUntil(listItemScope, a.Li) + case a.Dd, a.Dt: + p.popUntil(defaultScope, p.tok.DataAtom) + case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6: + p.popUntil(defaultScope, a.H1, a.H2, a.H3, a.H4, a.H5, a.H6) + case a.A, a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.Nobr, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U: + p.inBodyEndTagFormatting(p.tok.DataAtom, p.tok.Data) + case a.Applet, a.Marquee, a.Object: + if p.popUntil(defaultScope, p.tok.DataAtom) { + p.clearActiveFormattingElements() + } + case a.Br: + p.tok.Type = StartTagToken + return false + case a.Template: + return inHeadIM(p) + default: + p.inBodyEndTagOther(p.tok.DataAtom, p.tok.Data) + } + case CommentToken: + p.addChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + case ErrorToken: + // TODO: remove this divergence from the HTML5 spec. + if len(p.templateStack) > 0 { + p.im = inTemplateIM + return false + } + for _, e := range p.oe { + switch e.DataAtom { + case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc, a.Tbody, a.Td, a.Tfoot, a.Th, + a.Thead, a.Tr, a.Body, a.Html: + default: + return true + } + } + } + + return true +} + +func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom, tagName string) { + // This is the "adoption agency" algorithm, described at + // https://html.spec.whatwg.org/multipage/syntax.html#adoptionAgency + + // TODO: this is a fairly literal line-by-line translation of that algorithm. + // Once the code successfully parses the comprehensive test suite, we should + // refactor this code to be more idiomatic. + + // Steps 1-2 + if current := p.oe.top(); current.Data == tagName && p.afe.index(current) == -1 { + p.oe.pop() + return + } + + // Steps 3-5. The outer loop. + for i := 0; i < 8; i++ { + // Step 6. Find the formatting element. + var formattingElement *Node + for j := len(p.afe) - 1; j >= 0; j-- { + if p.afe[j].Type == scopeMarkerNode { + break + } + if p.afe[j].DataAtom == tagAtom { + formattingElement = p.afe[j] + break + } + } + if formattingElement == nil { + p.inBodyEndTagOther(tagAtom, tagName) + return + } + + // Step 7. Ignore the tag if formatting element is not in the stack of open elements. + feIndex := p.oe.index(formattingElement) + if feIndex == -1 { + p.afe.remove(formattingElement) + return + } + // Step 8. Ignore the tag if formatting element is not in the scope. + if !p.elementInScope(defaultScope, tagAtom) { + // Ignore the tag. + return + } + + // Step 9. This step is omitted because it's just a parse error but no need to return. + + // Steps 10-11. Find the furthest block. + var furthestBlock *Node + for _, e := range p.oe[feIndex:] { + if isSpecialElement(e) { + furthestBlock = e + break + } + } + if furthestBlock == nil { + e := p.oe.pop() + for e != formattingElement { + e = p.oe.pop() + } + p.afe.remove(e) + return + } + + // Steps 12-13. Find the common ancestor and bookmark node. + commonAncestor := p.oe[feIndex-1] + bookmark := p.afe.index(formattingElement) + + // Step 14. The inner loop. Find the lastNode to reparent. + lastNode := furthestBlock + node := furthestBlock + x := p.oe.index(node) + // Step 14.1. + j := 0 + for { + // Step 14.2. + j++ + // Step. 14.3. + x-- + node = p.oe[x] + // Step 14.4. Go to the next step if node is formatting element. + if node == formattingElement { + break + } + // Step 14.5. Remove node from the list of active formatting elements if + // inner loop counter is greater than three and node is in the list of + // active formatting elements. + if ni := p.afe.index(node); j > 3 && ni > -1 { + p.afe.remove(node) + // If any element of the list of active formatting elements is removed, + // we need to take care whether bookmark should be decremented or not. + // This is because the value of bookmark may exceed the size of the + // list by removing elements from the list. + if ni <= bookmark { + bookmark-- + } + continue + } + // Step 14.6. Continue the next inner loop if node is not in the list of + // active formatting elements. + if p.afe.index(node) == -1 { + p.oe.remove(node) + continue + } + // Step 14.7. + clone := node.clone() + p.afe[p.afe.index(node)] = clone + p.oe[p.oe.index(node)] = clone + node = clone + // Step 14.8. + if lastNode == furthestBlock { + bookmark = p.afe.index(node) + 1 + } + // Step 14.9. + if lastNode.Parent != nil { + lastNode.Parent.RemoveChild(lastNode) + } + node.AppendChild(lastNode) + // Step 14.10. + lastNode = node + } + + // Step 15. Reparent lastNode to the common ancestor, + // or for misnested table nodes, to the foster parent. + if lastNode.Parent != nil { + lastNode.Parent.RemoveChild(lastNode) + } + switch commonAncestor.DataAtom { + case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: + p.fosterParent(lastNode) + default: + commonAncestor.AppendChild(lastNode) + } + + // Steps 16-18. Reparent nodes from the furthest block's children + // to a clone of the formatting element. + clone := formattingElement.clone() + reparentChildren(clone, furthestBlock) + furthestBlock.AppendChild(clone) + + // Step 19. Fix up the list of active formatting elements. + if oldLoc := p.afe.index(formattingElement); oldLoc != -1 && oldLoc < bookmark { + // Move the bookmark with the rest of the list. + bookmark-- + } + p.afe.remove(formattingElement) + p.afe.insert(bookmark, clone) + + // Step 20. Fix up the stack of open elements. + p.oe.remove(formattingElement) + p.oe.insert(p.oe.index(furthestBlock)+1, clone) + } +} + +// inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM. +// "Any other end tag" handling from 12.2.6.5 The rules for parsing tokens in foreign content +// https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inforeign +func (p *parser) inBodyEndTagOther(tagAtom a.Atom, tagName string) { + for i := len(p.oe) - 1; i >= 0; i-- { + // Two element nodes have the same tag if they have the same Data (a + // string-typed field). As an optimization, for common HTML tags, each + // Data string is assigned a unique, non-zero DataAtom (a uint32-typed + // field), since integer comparison is faster than string comparison. + // Uncommon (custom) tags get a zero DataAtom. + // + // The if condition here is equivalent to (p.oe[i].Data == tagName). + if (p.oe[i].DataAtom == tagAtom) && + ((tagAtom != 0) || (p.oe[i].Data == tagName)) { + p.oe = p.oe[:i] + break + } + if isSpecialElement(p.oe[i]) { + break + } + } +} + +// Section 12.2.6.4.8. +func textIM(p *parser) bool { + switch p.tok.Type { + case ErrorToken: + p.oe.pop() + case TextToken: + d := p.tok.Data + if n := p.oe.top(); n.DataAtom == a.Textarea && n.FirstChild == nil { + // Ignore a newline at the start of a block. + if d != "" && d[0] == '\r' { + d = d[1:] + } + if d != "" && d[0] == '\n' { + d = d[1:] + } + } + if d == "" { + return true + } + p.addText(d) + return true + case EndTagToken: + p.oe.pop() + } + p.im = p.originalIM + p.originalIM = nil + return p.tok.Type == EndTagToken +} + +// Section 12.2.6.4.9. +func inTableIM(p *parser) bool { + switch p.tok.Type { + case TextToken: + p.tok.Data = strings.Replace(p.tok.Data, "\x00", "", -1) + switch p.oe.top().DataAtom { + case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: + if strings.Trim(p.tok.Data, whitespace) == "" { + p.addText(p.tok.Data) + return true + } + } + case StartTagToken: + switch p.tok.DataAtom { + case a.Caption: + p.clearStackToContext(tableScope) + p.afe = append(p.afe, &scopeMarker) + p.addElement() + p.im = inCaptionIM + return true + case a.Colgroup: + p.clearStackToContext(tableScope) + p.addElement() + p.im = inColumnGroupIM + return true + case a.Col: + p.parseImpliedToken(StartTagToken, a.Colgroup, a.Colgroup.String()) + return false + case a.Tbody, a.Tfoot, a.Thead: + p.clearStackToContext(tableScope) + p.addElement() + p.im = inTableBodyIM + return true + case a.Td, a.Th, a.Tr: + p.parseImpliedToken(StartTagToken, a.Tbody, a.Tbody.String()) + return false + case a.Table: + if p.popUntil(tableScope, a.Table) { + p.resetInsertionMode() + return false + } + // Ignore the token. + return true + case a.Style, a.Script, a.Template: + return inHeadIM(p) + case a.Input: + for _, t := range p.tok.Attr { + if t.Key == "type" && strings.ToLower(t.Val) == "hidden" { + p.addElement() + p.oe.pop() + return true + } + } + // Otherwise drop down to the default action. + case a.Form: + if p.oe.contains(a.Template) || p.form != nil { + // Ignore the token. + return true + } + p.addElement() + p.form = p.oe.pop() + case a.Select: + p.reconstructActiveFormattingElements() + switch p.top().DataAtom { + case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: + p.fosterParenting = true + } + p.addElement() + p.fosterParenting = false + p.framesetOK = false + p.im = inSelectInTableIM + return true + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Table: + if p.popUntil(tableScope, a.Table) { + p.resetInsertionMode() + return true + } + // Ignore the token. + return true + case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: + // Ignore the token. + return true + case a.Template: + return inHeadIM(p) + } + case CommentToken: + p.addChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + return true + case DoctypeToken: + // Ignore the token. + return true + case ErrorToken: + return inBodyIM(p) + } + + p.fosterParenting = true + defer func() { p.fosterParenting = false }() + + return inBodyIM(p) +} + +// Section 12.2.6.4.11. +func inCaptionIM(p *parser) bool { + switch p.tok.Type { + case StartTagToken: + switch p.tok.DataAtom { + case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Thead, a.Tr: + if !p.popUntil(tableScope, a.Caption) { + // Ignore the token. + return true + } + p.clearActiveFormattingElements() + p.im = inTableIM + return false + case a.Select: + p.reconstructActiveFormattingElements() + p.addElement() + p.framesetOK = false + p.im = inSelectInTableIM + return true + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Caption: + if p.popUntil(tableScope, a.Caption) { + p.clearActiveFormattingElements() + p.im = inTableIM + } + return true + case a.Table: + if !p.popUntil(tableScope, a.Caption) { + // Ignore the token. + return true + } + p.clearActiveFormattingElements() + p.im = inTableIM + return false + case a.Body, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: + // Ignore the token. + return true + } + } + return inBodyIM(p) +} + +// Section 12.2.6.4.12. +func inColumnGroupIM(p *parser) bool { + switch p.tok.Type { + case TextToken: + s := strings.TrimLeft(p.tok.Data, whitespace) + if len(s) < len(p.tok.Data) { + // Add the initial whitespace to the current node. + p.addText(p.tok.Data[:len(p.tok.Data)-len(s)]) + if s == "" { + return true + } + p.tok.Data = s + } + case CommentToken: + p.addChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + return true + case DoctypeToken: + // Ignore the token. + return true + case StartTagToken: + switch p.tok.DataAtom { + case a.Html: + return inBodyIM(p) + case a.Col: + p.addElement() + p.oe.pop() + p.acknowledgeSelfClosingTag() + return true + case a.Template: + return inHeadIM(p) + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Colgroup: + if p.oe.top().DataAtom == a.Colgroup { + p.oe.pop() + p.im = inTableIM + } + return true + case a.Col: + // Ignore the token. + return true + case a.Template: + return inHeadIM(p) + } + case ErrorToken: + return inBodyIM(p) + } + if p.oe.top().DataAtom != a.Colgroup { + return true + } + p.oe.pop() + p.im = inTableIM + return false +} + +// Section 12.2.6.4.13. +func inTableBodyIM(p *parser) bool { + switch p.tok.Type { + case StartTagToken: + switch p.tok.DataAtom { + case a.Tr: + p.clearStackToContext(tableBodyScope) + p.addElement() + p.im = inRowIM + return true + case a.Td, a.Th: + p.parseImpliedToken(StartTagToken, a.Tr, a.Tr.String()) + return false + case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead: + if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) { + p.im = inTableIM + return false + } + // Ignore the token. + return true + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Tbody, a.Tfoot, a.Thead: + if p.elementInScope(tableScope, p.tok.DataAtom) { + p.clearStackToContext(tableBodyScope) + p.oe.pop() + p.im = inTableIM + } + return true + case a.Table: + if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) { + p.im = inTableIM + return false + } + // Ignore the token. + return true + case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th, a.Tr: + // Ignore the token. + return true + } + case CommentToken: + p.addChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + return true + } + + return inTableIM(p) +} + +// Section 12.2.6.4.14. +func inRowIM(p *parser) bool { + switch p.tok.Type { + case StartTagToken: + switch p.tok.DataAtom { + case a.Td, a.Th: + p.clearStackToContext(tableRowScope) + p.addElement() + p.afe = append(p.afe, &scopeMarker) + p.im = inCellIM + return true + case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead, a.Tr: + if p.popUntil(tableScope, a.Tr) { + p.im = inTableBodyIM + return false + } + // Ignore the token. + return true + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Tr: + if p.popUntil(tableScope, a.Tr) { + p.im = inTableBodyIM + return true + } + // Ignore the token. + return true + case a.Table: + if p.popUntil(tableScope, a.Tr) { + p.im = inTableBodyIM + return false + } + // Ignore the token. + return true + case a.Tbody, a.Tfoot, a.Thead: + if p.elementInScope(tableScope, p.tok.DataAtom) { + p.parseImpliedToken(EndTagToken, a.Tr, a.Tr.String()) + return false + } + // Ignore the token. + return true + case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th: + // Ignore the token. + return true + } + } + + return inTableIM(p) +} + +// Section 12.2.6.4.15. +func inCellIM(p *parser) bool { + switch p.tok.Type { + case StartTagToken: + switch p.tok.DataAtom { + case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: + if p.popUntil(tableScope, a.Td, a.Th) { + // Close the cell and reprocess. + p.clearActiveFormattingElements() + p.im = inRowIM + return false + } + // Ignore the token. + return true + case a.Select: + p.reconstructActiveFormattingElements() + p.addElement() + p.framesetOK = false + p.im = inSelectInTableIM + return true + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Td, a.Th: + if !p.popUntil(tableScope, p.tok.DataAtom) { + // Ignore the token. + return true + } + p.clearActiveFormattingElements() + p.im = inRowIM + return true + case a.Body, a.Caption, a.Col, a.Colgroup, a.Html: + // Ignore the token. + return true + case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: + if !p.elementInScope(tableScope, p.tok.DataAtom) { + // Ignore the token. + return true + } + // Close the cell and reprocess. + if p.popUntil(tableScope, a.Td, a.Th) { + p.clearActiveFormattingElements() + } + p.im = inRowIM + return false + } + } + return inBodyIM(p) +} + +// Section 12.2.6.4.16. +func inSelectIM(p *parser) bool { + switch p.tok.Type { + case TextToken: + p.addText(strings.Replace(p.tok.Data, "\x00", "", -1)) + case StartTagToken: + switch p.tok.DataAtom { + case a.Html: + return inBodyIM(p) + case a.Option: + if p.top().DataAtom == a.Option { + p.oe.pop() + } + p.addElement() + case a.Optgroup: + if p.top().DataAtom == a.Option { + p.oe.pop() + } + if p.top().DataAtom == a.Optgroup { + p.oe.pop() + } + p.addElement() + case a.Select: + if !p.popUntil(selectScope, a.Select) { + // Ignore the token. + return true + } + p.resetInsertionMode() + case a.Input, a.Keygen, a.Textarea: + if p.elementInScope(selectScope, a.Select) { + p.parseImpliedToken(EndTagToken, a.Select, a.Select.String()) + return false + } + // In order to properly ignore , we need to change the tokenizer mode. + p.tokenizer.NextIsNotRawText() + // Ignore the token. + return true + case a.Script, a.Template: + return inHeadIM(p) + case a.Iframe, a.Noembed, a.Noframes, a.Noscript, a.Plaintext, a.Style, a.Title, a.Xmp: + // Don't let the tokenizer go into raw text mode when there are raw tags + // to be ignored. These tags should be ignored from the tokenizer + // properly. + p.tokenizer.NextIsNotRawText() + // Ignore the token. + return true + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Option: + if p.top().DataAtom == a.Option { + p.oe.pop() + } + case a.Optgroup: + i := len(p.oe) - 1 + if p.oe[i].DataAtom == a.Option { + i-- + } + if p.oe[i].DataAtom == a.Optgroup { + p.oe = p.oe[:i] + } + case a.Select: + if !p.popUntil(selectScope, a.Select) { + // Ignore the token. + return true + } + p.resetInsertionMode() + case a.Template: + return inHeadIM(p) + } + case CommentToken: + p.addChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + case DoctypeToken: + // Ignore the token. + return true + case ErrorToken: + return inBodyIM(p) + } + + return true +} + +// Section 12.2.6.4.17. +func inSelectInTableIM(p *parser) bool { + switch p.tok.Type { + case StartTagToken, EndTagToken: + switch p.tok.DataAtom { + case a.Caption, a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr, a.Td, a.Th: + if p.tok.Type == EndTagToken && !p.elementInScope(tableScope, p.tok.DataAtom) { + // Ignore the token. + return true + } + // This is like p.popUntil(selectScope, a.Select), but it also + // matches , not just . Matching the MathML + // tag is arguably incorrect (conceptually), but it mimics what + // Chromium does. + for i := len(p.oe) - 1; i >= 0; i-- { + if n := p.oe[i]; n.DataAtom == a.Select { + p.oe = p.oe[:i] + break + } + } + p.resetInsertionMode() + return false + } + } + return inSelectIM(p) +} + +// Section 12.2.6.4.18. +func inTemplateIM(p *parser) bool { + switch p.tok.Type { + case TextToken, CommentToken, DoctypeToken: + return inBodyIM(p) + case StartTagToken: + switch p.tok.DataAtom { + case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title: + return inHeadIM(p) + case a.Caption, a.Colgroup, a.Tbody, a.Tfoot, a.Thead: + p.templateStack.pop() + p.templateStack = append(p.templateStack, inTableIM) + p.im = inTableIM + return false + case a.Col: + p.templateStack.pop() + p.templateStack = append(p.templateStack, inColumnGroupIM) + p.im = inColumnGroupIM + return false + case a.Tr: + p.templateStack.pop() + p.templateStack = append(p.templateStack, inTableBodyIM) + p.im = inTableBodyIM + return false + case a.Td, a.Th: + p.templateStack.pop() + p.templateStack = append(p.templateStack, inRowIM) + p.im = inRowIM + return false + default: + p.templateStack.pop() + p.templateStack = append(p.templateStack, inBodyIM) + p.im = inBodyIM + return false + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Template: + return inHeadIM(p) + default: + // Ignore the token. + return true + } + case ErrorToken: + if !p.oe.contains(a.Template) { + // Ignore the token. + return true + } + // TODO: remove this divergence from the HTML5 spec. + // + // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668 + p.generateImpliedEndTags() + for i := len(p.oe) - 1; i >= 0; i-- { + if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template { + p.oe = p.oe[:i] + break + } + } + p.clearActiveFormattingElements() + p.templateStack.pop() + p.resetInsertionMode() + return false + } + return false +} + +// Section 12.2.6.4.19. +func afterBodyIM(p *parser) bool { + switch p.tok.Type { + case ErrorToken: + // Stop parsing. + return true + case TextToken: + s := strings.TrimLeft(p.tok.Data, whitespace) + if len(s) == 0 { + // It was all whitespace. + return inBodyIM(p) + } + case StartTagToken: + if p.tok.DataAtom == a.Html { + return inBodyIM(p) + } + case EndTagToken: + if p.tok.DataAtom == a.Html { + if !p.fragment { + p.im = afterAfterBodyIM + } + return true + } + case CommentToken: + // The comment is attached to the element. + if len(p.oe) < 1 || p.oe[0].DataAtom != a.Html { + panic("html: bad parser state: element not found, in the after-body insertion mode") + } + p.oe[0].AppendChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + return true + } + p.im = inBodyIM + return false +} + +// Section 12.2.6.4.20. +func inFramesetIM(p *parser) bool { + switch p.tok.Type { + case CommentToken: + p.addChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + case TextToken: + // Ignore all text but whitespace. + s := strings.Map(func(c rune) rune { + switch c { + case ' ', '\t', '\n', '\f', '\r': + return c + } + return -1 + }, p.tok.Data) + if s != "" { + p.addText(s) + } + case StartTagToken: + switch p.tok.DataAtom { + case a.Html: + return inBodyIM(p) + case a.Frameset: + p.addElement() + case a.Frame: + p.addElement() + p.oe.pop() + p.acknowledgeSelfClosingTag() + case a.Noframes: + return inHeadIM(p) + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Frameset: + if p.oe.top().DataAtom != a.Html { + p.oe.pop() + if p.oe.top().DataAtom != a.Frameset { + p.im = afterFramesetIM + return true + } + } + } + default: + // Ignore the token. + } + return true +} + +// Section 12.2.6.4.21. +func afterFramesetIM(p *parser) bool { + switch p.tok.Type { + case CommentToken: + p.addChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + case TextToken: + // Ignore all text but whitespace. + s := strings.Map(func(c rune) rune { + switch c { + case ' ', '\t', '\n', '\f', '\r': + return c + } + return -1 + }, p.tok.Data) + if s != "" { + p.addText(s) + } + case StartTagToken: + switch p.tok.DataAtom { + case a.Html: + return inBodyIM(p) + case a.Noframes: + return inHeadIM(p) + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Html: + p.im = afterAfterFramesetIM + return true + } + default: + // Ignore the token. + } + return true +} + +// Section 12.2.6.4.22. +func afterAfterBodyIM(p *parser) bool { + switch p.tok.Type { + case ErrorToken: + // Stop parsing. + return true + case TextToken: + s := strings.TrimLeft(p.tok.Data, whitespace) + if len(s) == 0 { + // It was all whitespace. + return inBodyIM(p) + } + case StartTagToken: + if p.tok.DataAtom == a.Html { + return inBodyIM(p) + } + case CommentToken: + p.doc.AppendChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + return true + case DoctypeToken: + return inBodyIM(p) + } + p.im = inBodyIM + return false +} + +// Section 12.2.6.4.23. +func afterAfterFramesetIM(p *parser) bool { + switch p.tok.Type { + case CommentToken: + p.doc.AppendChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + case TextToken: + // Ignore all text but whitespace. + s := strings.Map(func(c rune) rune { + switch c { + case ' ', '\t', '\n', '\f', '\r': + return c + } + return -1 + }, p.tok.Data) + if s != "" { + p.tok.Data = s + return inBodyIM(p) + } + case StartTagToken: + switch p.tok.DataAtom { + case a.Html: + return inBodyIM(p) + case a.Noframes: + return inHeadIM(p) + } + case DoctypeToken: + return inBodyIM(p) + default: + // Ignore the token. + } + return true +} + +const whitespaceOrNUL = whitespace + "\x00" + +// Section 12.2.6.5 +func parseForeignContent(p *parser) bool { + switch p.tok.Type { + case TextToken: + if p.framesetOK { + p.framesetOK = strings.TrimLeft(p.tok.Data, whitespaceOrNUL) == "" + } + p.tok.Data = strings.Replace(p.tok.Data, "\x00", "\ufffd", -1) + p.addText(p.tok.Data) + case CommentToken: + p.addChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + case StartTagToken: + if !p.fragment { + b := breakout[p.tok.Data] + if p.tok.DataAtom == a.Font { + loop: + for _, attr := range p.tok.Attr { + switch attr.Key { + case "color", "face", "size": + b = true + break loop + } + } + } + if b { + for i := len(p.oe) - 1; i >= 0; i-- { + n := p.oe[i] + if n.Namespace == "" || htmlIntegrationPoint(n) || mathMLTextIntegrationPoint(n) { + p.oe = p.oe[:i+1] + break + } + } + return false + } + } + current := p.adjustedCurrentNode() + switch current.Namespace { + case "math": + adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments) + case "svg": + // Adjust SVG tag names. The tokenizer lower-cases tag names, but + // SVG wants e.g. "foreignObject" with a capital second "O". + if x := svgTagNameAdjustments[p.tok.Data]; x != "" { + p.tok.DataAtom = a.Lookup([]byte(x)) + p.tok.Data = x + } + adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments) + default: + panic("html: bad parser state: unexpected namespace") + } + adjustForeignAttributes(p.tok.Attr) + namespace := current.Namespace + p.addElement() + p.top().Namespace = namespace + if namespace != "" { + // Don't let the tokenizer go into raw text mode in foreign content + // (e.g. in an SVG tag). + p.tokenizer.NextIsNotRawText() + } + if p.hasSelfClosingToken { + p.oe.pop() + p.acknowledgeSelfClosingTag() + } + case EndTagToken: + for i := len(p.oe) - 1; i >= 0; i-- { + if p.oe[i].Namespace == "" { + return p.im(p) + } + if strings.EqualFold(p.oe[i].Data, p.tok.Data) { + p.oe = p.oe[:i] + break + } + } + return true + default: + // Ignore the token. + } + return true +} + +// Section 12.2.4.2. +func (p *parser) adjustedCurrentNode() *Node { + if len(p.oe) == 1 && p.fragment && p.context != nil { + return p.context + } + return p.oe.top() +} + +// Section 12.2.6. +func (p *parser) inForeignContent() bool { + if len(p.oe) == 0 { + return false + } + n := p.adjustedCurrentNode() + if n.Namespace == "" { + return false + } + if mathMLTextIntegrationPoint(n) { + if p.tok.Type == StartTagToken && p.tok.DataAtom != a.Mglyph && p.tok.DataAtom != a.Malignmark { + return false + } + if p.tok.Type == TextToken { + return false + } + } + if n.Namespace == "math" && n.DataAtom == a.AnnotationXml && p.tok.Type == StartTagToken && p.tok.DataAtom == a.Svg { + return false + } + if htmlIntegrationPoint(n) && (p.tok.Type == StartTagToken || p.tok.Type == TextToken) { + return false + } + if p.tok.Type == ErrorToken { + return false + } + return true +} + +// parseImpliedToken parses a token as though it had appeared in the parser's +// input. +func (p *parser) parseImpliedToken(t TokenType, dataAtom a.Atom, data string) { + realToken, selfClosing := p.tok, p.hasSelfClosingToken + p.tok = Token{ + Type: t, + DataAtom: dataAtom, + Data: data, + } + p.hasSelfClosingToken = false + p.parseCurrentToken() + p.tok, p.hasSelfClosingToken = realToken, selfClosing +} + +// parseCurrentToken runs the current token through the parsing routines +// until it is consumed. +func (p *parser) parseCurrentToken() { + if p.tok.Type == SelfClosingTagToken { + p.hasSelfClosingToken = true + p.tok.Type = StartTagToken + } + + consumed := false + for !consumed { + if p.inForeignContent() { + consumed = parseForeignContent(p) + } else { + consumed = p.im(p) + } + } + + if p.hasSelfClosingToken { + // This is a parse error, but ignore it. + p.hasSelfClosingToken = false + } +} + +func (p *parser) parse() error { + // Iterate until EOF. Any other error will cause an early return. + var err error + for err != io.EOF { + // CDATA sections are allowed only in foreign content. + n := p.oe.top() + p.tokenizer.AllowCDATA(n != nil && n.Namespace != "") + // Read and parse the next token. + p.tokenizer.Next() + p.tok = p.tokenizer.Token() + if p.tok.Type == ErrorToken { + err = p.tokenizer.Err() + if err != nil && err != io.EOF { + return err + } + } + p.parseCurrentToken() + } + return nil +} + +// Parse returns the parse tree for the HTML from the given Reader. +// +// It implements the HTML5 parsing algorithm +// (https://html.spec.whatwg.org/multipage/syntax.html#tree-construction), +// which is very complicated. The resultant tree can contain implicitly created +// nodes that have no explicit listed in r's data, and nodes' parents can +// differ from the nesting implied by a naive processing of start and end +// s. Conversely, explicit s in r's data can be silently dropped, +// with no corresponding node in the resulting tree. +// +// The input is assumed to be UTF-8 encoded. +func Parse(r io.Reader) (*Node, error) { + return ParseWithOptions(r) +} + +// ParseFragment parses a fragment of HTML and returns the nodes that were +// found. If the fragment is the InnerHTML for an existing element, pass that +// element in context. +// +// It has the same intricacies as Parse. +func ParseFragment(r io.Reader, context *Node) ([]*Node, error) { + return ParseFragmentWithOptions(r, context) +} + +// ParseOption configures a parser. +type ParseOption func(p *parser) + +// ParseOptionEnableScripting configures the scripting flag. +// https://html.spec.whatwg.org/multipage/webappapis.html#enabling-and-disabling-scripting +// +// By default, scripting is enabled. +func ParseOptionEnableScripting(enable bool) ParseOption { + return func(p *parser) { + p.scripting = enable + } +} + +// ParseWithOptions is like Parse, with options. +func ParseWithOptions(r io.Reader, opts ...ParseOption) (*Node, error) { + p := &parser{ + tokenizer: NewTokenizer(r), + doc: &Node{ + Type: DocumentNode, + }, + scripting: true, + framesetOK: true, + im: initialIM, + } + + for _, f := range opts { + f(p) + } + + if err := p.parse(); err != nil { + return nil, err + } + return p.doc, nil +} + +// ParseFragmentWithOptions is like ParseFragment, with options. +func ParseFragmentWithOptions(r io.Reader, context *Node, opts ...ParseOption) ([]*Node, error) { + contextTag := "" + if context != nil { + if context.Type != ElementNode { + return nil, errors.New("html: ParseFragment of non-element Node") + } + // The next check isn't just context.DataAtom.String() == context.Data because + // it is valid to pass an element whose tag isn't a known atom. For example, + // DataAtom == 0 and Data = "tagfromthefuture" is perfectly consistent. + if context.DataAtom != a.Lookup([]byte(context.Data)) { + return nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data) + } + contextTag = context.DataAtom.String() + } + p := &parser{ + doc: &Node{ + Type: DocumentNode, + }, + scripting: true, + fragment: true, + context: context, + } + if context != nil && context.Namespace != "" { + p.tokenizer = NewTokenizer(r) + } else { + p.tokenizer = NewTokenizerFragment(r, contextTag) + } + + for _, f := range opts { + f(p) + } + + root := &Node{ + Type: ElementNode, + DataAtom: a.Html, + Data: a.Html.String(), + } + p.doc.AppendChild(root) + p.oe = nodeStack{root} + if context != nil && context.DataAtom == a.Template { + p.templateStack = append(p.templateStack, inTemplateIM) + } + p.resetInsertionMode() + + for n := context; n != nil; n = n.Parent { + if n.Type == ElementNode && n.DataAtom == a.Form { + p.form = n + break + } + } + + if err := p.parse(); err != nil { + return nil, err + } + + parent := p.doc + if context != nil { + parent = root + } + + var result []*Node + for c := parent.FirstChild; c != nil; { + next := c.NextSibling + parent.RemoveChild(c) + result = append(result, c) + c = next + } + return result, nil +} diff --git a/vendor/golang.org/x/net/html/render.go b/vendor/golang.org/x/net/html/render.go new file mode 100644 index 0000000..b46d81c --- /dev/null +++ b/vendor/golang.org/x/net/html/render.go @@ -0,0 +1,273 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package html + +import ( + "bufio" + "errors" + "fmt" + "io" + "strings" +) + +type writer interface { + io.Writer + io.ByteWriter + WriteString(string) (int, error) +} + +// Render renders the parse tree n to the given writer. +// +// Rendering is done on a 'best effort' basis: calling Parse on the output of +// Render will always result in something similar to the original tree, but it +// is not necessarily an exact clone unless the original tree was 'well-formed'. +// 'Well-formed' is not easily specified; the HTML5 specification is +// complicated. +// +// Calling Parse on arbitrary input typically results in a 'well-formed' parse +// tree. However, it is possible for Parse to yield a 'badly-formed' parse tree. +// For example, in a 'well-formed' parse tree, no element is a child of +// another element: parsing "" results in two sibling elements. +// Similarly, in a 'well-formed' parse tree, no element is a child of a +// element: parsing "" results in a with two sibling +// children; the is reparented to the 's parent. However, calling +// Parse on "" does not return an error, but the result has an +// element with an child, and is therefore not 'well-formed'. +// +// Programmatically constructed trees are typically also 'well-formed', but it +// is possible to construct a tree that looks innocuous but, when rendered and +// re-parsed, results in a different tree. A simple example is that a solitary +// text node would become a tree containing , and elements. +// Another example is that the programmatic equivalent of "abc" +// becomes "abc". +func Render(w io.Writer, n *Node) error { + if x, ok := w.(writer); ok { + return render(x, n) + } + buf := bufio.NewWriter(w) + if err := render(buf, n); err != nil { + return err + } + return buf.Flush() +} + +// plaintextAbort is returned from render1 when a element +// has been rendered. No more end tags should be rendered after that. +var plaintextAbort = errors.New("html: internal error (plaintext abort)") + +func render(w writer, n *Node) error { + err := render1(w, n) + if err == plaintextAbort { + err = nil + } + return err +} + +func render1(w writer, n *Node) error { + // Render non-element nodes; these are the easy cases. + switch n.Type { + case ErrorNode: + return errors.New("html: cannot render an ErrorNode node") + case TextNode: + return escape(w, n.Data) + case DocumentNode: + for c := n.FirstChild; c != nil; c = c.NextSibling { + if err := render1(w, c); err != nil { + return err + } + } + return nil + case ElementNode: + // No-op. + case CommentNode: + if _, err := w.WriteString(""); err != nil { + return err + } + return nil + case DoctypeNode: + if _, err := w.WriteString("') + case RawNode: + _, err := w.WriteString(n.Data) + return err + default: + return errors.New("html: unknown node type") + } + + // Render the opening tag. + if err := w.WriteByte('<'); err != nil { + return err + } + if _, err := w.WriteString(n.Data); err != nil { + return err + } + for _, a := range n.Attr { + if err := w.WriteByte(' '); err != nil { + return err + } + if a.Namespace != "" { + if _, err := w.WriteString(a.Namespace); err != nil { + return err + } + if err := w.WriteByte(':'); err != nil { + return err + } + } + if _, err := w.WriteString(a.Key); err != nil { + return err + } + if _, err := w.WriteString(`="`); err != nil { + return err + } + if err := escape(w, a.Val); err != nil { + return err + } + if err := w.WriteByte('"'); err != nil { + return err + } + } + if voidElements[n.Data] { + if n.FirstChild != nil { + return fmt.Errorf("html: void element <%s> has child nodes", n.Data) + } + _, err := w.WriteString("/>") + return err + } + if err := w.WriteByte('>'); err != nil { + return err + } + + // Add initial newline where there is danger of a newline beging ignored. + if c := n.FirstChild; c != nil && c.Type == TextNode && strings.HasPrefix(c.Data, "\n") { + switch n.Data { + case "pre", "listing", "textarea": + if err := w.WriteByte('\n'); err != nil { + return err + } + } + } + + // Render any child nodes. + switch n.Data { + case "iframe", "noembed", "noframes", "noscript", "plaintext", "script", "style", "xmp": + for c := n.FirstChild; c != nil; c = c.NextSibling { + if c.Type == TextNode { + if _, err := w.WriteString(c.Data); err != nil { + return err + } + } else { + if err := render1(w, c); err != nil { + return err + } + } + } + if n.Data == "plaintext" { + // Don't render anything else. must be the + // last element in the file, with no closing tag. + return plaintextAbort + } + default: + for c := n.FirstChild; c != nil; c = c.NextSibling { + if err := render1(w, c); err != nil { + return err + } + } + } + + // Render the closing tag. + if _, err := w.WriteString(""); err != nil { + return err + } + if _, err := w.WriteString(n.Data); err != nil { + return err + } + return w.WriteByte('>') +} + +// writeQuoted writes s to w surrounded by quotes. Normally it will use double +// quotes, but if s contains a double quote, it will use single quotes. +// It is used for writing the identifiers in a doctype declaration. +// In valid HTML, they can't contain both types of quotes. +func writeQuoted(w writer, s string) error { + var q byte = '"' + if strings.Contains(s, `"`) { + q = '\'' + } + if err := w.WriteByte(q); err != nil { + return err + } + if _, err := w.WriteString(s); err != nil { + return err + } + if err := w.WriteByte(q); err != nil { + return err + } + return nil +} + +// Section 12.1.2, "Elements", gives this list of void elements. Void elements +// are those that can't have any contents. +var voidElements = map[string]bool{ + "area": true, + "base": true, + "br": true, + "col": true, + "embed": true, + "hr": true, + "img": true, + "input": true, + "keygen": true, // "keygen" has been removed from the spec, but are kept here for backwards compatibility. + "link": true, + "meta": true, + "param": true, + "source": true, + "track": true, + "wbr": true, +} diff --git a/vendor/golang.org/x/net/html/token.go b/vendor/golang.org/x/net/html/token.go new file mode 100644 index 0000000..877709f --- /dev/null +++ b/vendor/golang.org/x/net/html/token.go @@ -0,0 +1,1224 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package html + +import ( + "bytes" + "errors" + "io" + "strconv" + "strings" + + "golang.org/x/net/html/atom" +) + +// A TokenType is the type of a Token. +type TokenType uint32 + +const ( + // ErrorToken means that an error occurred during tokenization. + ErrorToken TokenType = iota + // TextToken means a text node. + TextToken + // A StartTagToken looks like . + StartTagToken + // An EndTagToken looks like . + EndTagToken + // A SelfClosingTagToken tag looks like . + SelfClosingTagToken + // A CommentToken looks like . + CommentToken + // A DoctypeToken looks like + DoctypeToken +) + +// ErrBufferExceeded means that the buffering limit was exceeded. +var ErrBufferExceeded = errors.New("max buffer exceeded") + +// String returns a string representation of the TokenType. +func (t TokenType) String() string { + switch t { + case ErrorToken: + return "Error" + case TextToken: + return "Text" + case StartTagToken: + return "StartTag" + case EndTagToken: + return "EndTag" + case SelfClosingTagToken: + return "SelfClosingTag" + case CommentToken: + return "Comment" + case DoctypeToken: + return "Doctype" + } + return "Invalid(" + strconv.Itoa(int(t)) + ")" +} + +// An Attribute is an attribute namespace-key-value triple. Namespace is +// non-empty for foreign attributes like xlink, Key is alphabetic (and hence +// does not contain escapable characters like '&', '<' or '>'), and Val is +// unescaped (it looks like "a" + case EndTagToken: + return "" + t.tagString() + ">" + case SelfClosingTagToken: + return "<" + t.tagString() + "/>" + case CommentToken: + return "" + case DoctypeToken: + return "" + } + return "Invalid(" + strconv.Itoa(int(t.Type)) + ")" +} + +// span is a range of bytes in a Tokenizer's buffer. The start is inclusive, +// the end is exclusive. +type span struct { + start, end int +} + +// A Tokenizer returns a stream of HTML Tokens. +type Tokenizer struct { + // r is the source of the HTML text. + r io.Reader + // tt is the TokenType of the current token. + tt TokenType + // err is the first error encountered during tokenization. It is possible + // for tt != Error && err != nil to hold: this means that Next returned a + // valid token but the subsequent Next call will return an error token. + // For example, if the HTML text input was just "plain", then the first + // Next call would set z.err to io.EOF but return a TextToken, and all + // subsequent Next calls would return an ErrorToken. + // err is never reset. Once it becomes non-nil, it stays non-nil. + err error + // readErr is the error returned by the io.Reader r. It is separate from + // err because it is valid for an io.Reader to return (n int, err1 error) + // such that n > 0 && err1 != nil, and callers should always process the + // n > 0 bytes before considering the error err1. + readErr error + // buf[raw.start:raw.end] holds the raw bytes of the current token. + // buf[raw.end:] is buffered input that will yield future tokens. + raw span + buf []byte + // maxBuf limits the data buffered in buf. A value of 0 means unlimited. + maxBuf int + // buf[data.start:data.end] holds the raw bytes of the current token's data: + // a text token's text, a tag token's tag name, etc. + data span + // pendingAttr is the attribute key and value currently being tokenized. + // When complete, pendingAttr is pushed onto attr. nAttrReturned is + // incremented on each call to TagAttr. + pendingAttr [2]span + attr [][2]span + nAttrReturned int + // rawTag is the "script" in "" that closes the next token. If + // non-empty, the subsequent call to Next will return a raw or RCDATA text + // token: one that treats "" as text instead of an element. + // rawTag's contents are lower-cased. + rawTag string + // textIsRaw is whether the current text token's data is not escaped. + textIsRaw bool + // convertNUL is whether NUL bytes in the current token's data should + // be converted into \ufffd replacement characters. + convertNUL bool + // allowCDATA is whether CDATA sections are allowed in the current context. + allowCDATA bool +} + +// AllowCDATA sets whether or not the tokenizer recognizes as +// the text "foo". The default value is false, which means to recognize it as +// a bogus comment "" instead. +// +// Strictly speaking, an HTML5 compliant tokenizer should allow CDATA if and +// only if tokenizing foreign content, such as MathML and SVG. However, +// tracking foreign-contentness is difficult to do purely in the tokenizer, +// as opposed to the parser, due to HTML integration points: an element +// can contain a that is foreign-to-SVG but not foreign-to- +// HTML. For strict compliance with the HTML5 tokenization algorithm, it is the +// responsibility of the user of a tokenizer to call AllowCDATA as appropriate. +// In practice, if using the tokenizer without caring whether MathML or SVG +// CDATA is text or comments, such as tokenizing HTML to find all the anchor +// text, it is acceptable to ignore this responsibility. +func (z *Tokenizer) AllowCDATA(allowCDATA bool) { + z.allowCDATA = allowCDATA +} + +// NextIsNotRawText instructs the tokenizer that the next token should not be +// considered as 'raw text'. Some elements, such as script and title elements, +// normally require the next token after the opening tag to be 'raw text' that +// has no child elements. For example, tokenizing "acd" +// yields a start tag token for "", a text token for "acd", and +// an end tag token for "". There are no distinct start tag or end tag +// tokens for the "" and "". +// +// This tokenizer implementation will generally look for raw text at the right +// times. Strictly speaking, an HTML5 compliant tokenizer should not look for +// raw text if in foreign content: generally needs raw text, but a +// inside an does not. Another example is that a +// generally needs raw text, but a is not allowed as an immediate +// child of a ; in normal parsing, a implies , but +// one cannot close the implicit element when parsing a 's InnerHTML. +// Similarly to AllowCDATA, tracking the correct moment to override raw-text- +// ness is difficult to do purely in the tokenizer, as opposed to the parser. +// For strict compliance with the HTML5 tokenization algorithm, it is the +// responsibility of the user of a tokenizer to call NextIsNotRawText as +// appropriate. In practice, like AllowCDATA, it is acceptable to ignore this +// responsibility for basic usage. +// +// Note that this 'raw text' concept is different from the one offered by the +// Tokenizer.Raw method. +func (z *Tokenizer) NextIsNotRawText() { + z.rawTag = "" +} + +// Err returns the error associated with the most recent ErrorToken token. +// This is typically io.EOF, meaning the end of tokenization. +func (z *Tokenizer) Err() error { + if z.tt != ErrorToken { + return nil + } + return z.err +} + +// readByte returns the next byte from the input stream, doing a buffered read +// from z.r into z.buf if necessary. z.buf[z.raw.start:z.raw.end] remains a contiguous byte +// slice that holds all the bytes read so far for the current token. +// It sets z.err if the underlying reader returns an error. +// Pre-condition: z.err == nil. +func (z *Tokenizer) readByte() byte { + if z.raw.end >= len(z.buf) { + // Our buffer is exhausted and we have to read from z.r. Check if the + // previous read resulted in an error. + if z.readErr != nil { + z.err = z.readErr + return 0 + } + // We copy z.buf[z.raw.start:z.raw.end] to the beginning of z.buf. If the length + // z.raw.end - z.raw.start is more than half the capacity of z.buf, then we + // allocate a new buffer before the copy. + c := cap(z.buf) + d := z.raw.end - z.raw.start + var buf1 []byte + if 2*d > c { + buf1 = make([]byte, d, 2*c) + } else { + buf1 = z.buf[:d] + } + copy(buf1, z.buf[z.raw.start:z.raw.end]) + if x := z.raw.start; x != 0 { + // Adjust the data/attr spans to refer to the same contents after the copy. + z.data.start -= x + z.data.end -= x + z.pendingAttr[0].start -= x + z.pendingAttr[0].end -= x + z.pendingAttr[1].start -= x + z.pendingAttr[1].end -= x + for i := range z.attr { + z.attr[i][0].start -= x + z.attr[i][0].end -= x + z.attr[i][1].start -= x + z.attr[i][1].end -= x + } + } + z.raw.start, z.raw.end, z.buf = 0, d, buf1[:d] + // Now that we have copied the live bytes to the start of the buffer, + // we read from z.r into the remainder. + var n int + n, z.readErr = readAtLeastOneByte(z.r, buf1[d:cap(buf1)]) + if n == 0 { + z.err = z.readErr + return 0 + } + z.buf = buf1[:d+n] + } + x := z.buf[z.raw.end] + z.raw.end++ + if z.maxBuf > 0 && z.raw.end-z.raw.start >= z.maxBuf { + z.err = ErrBufferExceeded + return 0 + } + return x +} + +// Buffered returns a slice containing data buffered but not yet tokenized. +func (z *Tokenizer) Buffered() []byte { + return z.buf[z.raw.end:] +} + +// readAtLeastOneByte wraps an io.Reader so that reading cannot return (0, nil). +// It returns io.ErrNoProgress if the underlying r.Read method returns (0, nil) +// too many times in succession. +func readAtLeastOneByte(r io.Reader, b []byte) (int, error) { + for i := 0; i < 100; i++ { + if n, err := r.Read(b); n != 0 || err != nil { + return n, err + } + } + return 0, io.ErrNoProgress +} + +// skipWhiteSpace skips past any white space. +func (z *Tokenizer) skipWhiteSpace() { + if z.err != nil { + return + } + for { + c := z.readByte() + if z.err != nil { + return + } + switch c { + case ' ', '\n', '\r', '\t', '\f': + // No-op. + default: + z.raw.end-- + return + } + } +} + +// readRawOrRCDATA reads until the next "", where "foo" is z.rawTag and +// is typically something like "script" or "textarea". +func (z *Tokenizer) readRawOrRCDATA() { + if z.rawTag == "script" { + z.readScript() + z.textIsRaw = true + z.rawTag = "" + return + } +loop: + for { + c := z.readByte() + if z.err != nil { + break loop + } + if c != '<' { + continue loop + } + c = z.readByte() + if z.err != nil { + break loop + } + if c != '/' { + z.raw.end-- + continue loop + } + if z.readRawEndTag() || z.err != nil { + break loop + } + } + z.data.end = z.raw.end + // A textarea's or title's RCDATA can contain escaped entities. + z.textIsRaw = z.rawTag != "textarea" && z.rawTag != "title" + z.rawTag = "" +} + +// readRawEndTag attempts to read a tag like "", where "foo" is z.rawTag. +// If it succeeds, it backs up the input position to reconsume the tag and +// returns true. Otherwise it returns false. The opening "" has already been +// consumed. +func (z *Tokenizer) readRawEndTag() bool { + for i := 0; i < len(z.rawTag); i++ { + c := z.readByte() + if z.err != nil { + return false + } + if c != z.rawTag[i] && c != z.rawTag[i]-('a'-'A') { + z.raw.end-- + return false + } + } + c := z.readByte() + if z.err != nil { + return false + } + switch c { + case ' ', '\n', '\r', '\t', '\f', '/', '>': + // The 3 is 2 for the leading "" plus 1 for the trailing character c. + z.raw.end -= 3 + len(z.rawTag) + return true + } + z.raw.end-- + return false +} + +// readScript reads until the next tag, following the byzantine +// rules for escaping/hiding the closing tag. +func (z *Tokenizer) readScript() { + defer func() { + z.data.end = z.raw.end + }() + var c byte + +scriptData: + c = z.readByte() + if z.err != nil { + return + } + if c == '<' { + goto scriptDataLessThanSign + } + goto scriptData + +scriptDataLessThanSign: + c = z.readByte() + if z.err != nil { + return + } + switch c { + case '/': + goto scriptDataEndTagOpen + case '!': + goto scriptDataEscapeStart + } + z.raw.end-- + goto scriptData + +scriptDataEndTagOpen: + if z.readRawEndTag() || z.err != nil { + return + } + goto scriptData + +scriptDataEscapeStart: + c = z.readByte() + if z.err != nil { + return + } + if c == '-' { + goto scriptDataEscapeStartDash + } + z.raw.end-- + goto scriptData + +scriptDataEscapeStartDash: + c = z.readByte() + if z.err != nil { + return + } + if c == '-' { + goto scriptDataEscapedDashDash + } + z.raw.end-- + goto scriptData + +scriptDataEscaped: + c = z.readByte() + if z.err != nil { + return + } + switch c { + case '-': + goto scriptDataEscapedDash + case '<': + goto scriptDataEscapedLessThanSign + } + goto scriptDataEscaped + +scriptDataEscapedDash: + c = z.readByte() + if z.err != nil { + return + } + switch c { + case '-': + goto scriptDataEscapedDashDash + case '<': + goto scriptDataEscapedLessThanSign + } + goto scriptDataEscaped + +scriptDataEscapedDashDash: + c = z.readByte() + if z.err != nil { + return + } + switch c { + case '-': + goto scriptDataEscapedDashDash + case '<': + goto scriptDataEscapedLessThanSign + case '>': + goto scriptData + } + goto scriptDataEscaped + +scriptDataEscapedLessThanSign: + c = z.readByte() + if z.err != nil { + return + } + if c == '/' { + goto scriptDataEscapedEndTagOpen + } + if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' { + goto scriptDataDoubleEscapeStart + } + z.raw.end-- + goto scriptData + +scriptDataEscapedEndTagOpen: + if z.readRawEndTag() || z.err != nil { + return + } + goto scriptDataEscaped + +scriptDataDoubleEscapeStart: + z.raw.end-- + for i := 0; i < len("script"); i++ { + c = z.readByte() + if z.err != nil { + return + } + if c != "script"[i] && c != "SCRIPT"[i] { + z.raw.end-- + goto scriptDataEscaped + } + } + c = z.readByte() + if z.err != nil { + return + } + switch c { + case ' ', '\n', '\r', '\t', '\f', '/', '>': + goto scriptDataDoubleEscaped + } + z.raw.end-- + goto scriptDataEscaped + +scriptDataDoubleEscaped: + c = z.readByte() + if z.err != nil { + return + } + switch c { + case '-': + goto scriptDataDoubleEscapedDash + case '<': + goto scriptDataDoubleEscapedLessThanSign + } + goto scriptDataDoubleEscaped + +scriptDataDoubleEscapedDash: + c = z.readByte() + if z.err != nil { + return + } + switch c { + case '-': + goto scriptDataDoubleEscapedDashDash + case '<': + goto scriptDataDoubleEscapedLessThanSign + } + goto scriptDataDoubleEscaped + +scriptDataDoubleEscapedDashDash: + c = z.readByte() + if z.err != nil { + return + } + switch c { + case '-': + goto scriptDataDoubleEscapedDashDash + case '<': + goto scriptDataDoubleEscapedLessThanSign + case '>': + goto scriptData + } + goto scriptDataDoubleEscaped + +scriptDataDoubleEscapedLessThanSign: + c = z.readByte() + if z.err != nil { + return + } + if c == '/' { + goto scriptDataDoubleEscapeEnd + } + z.raw.end-- + goto scriptDataDoubleEscaped + +scriptDataDoubleEscapeEnd: + if z.readRawEndTag() { + z.raw.end += len("") + goto scriptDataEscaped + } + if z.err != nil { + return + } + goto scriptDataDoubleEscaped +} + +// readComment reads the next comment token starting with ". + z.data.end = z.data.start + } + }() + for dashCount := 2; ; { + c := z.readByte() + if z.err != nil { + // Ignore up to two dashes at EOF. + if dashCount > 2 { + dashCount = 2 + } + z.data.end = z.raw.end - dashCount + return + } + switch c { + case '-': + dashCount++ + continue + case '>': + if dashCount >= 2 { + z.data.end = z.raw.end - len("-->") + return + } + case '!': + if dashCount >= 2 { + c = z.readByte() + if z.err != nil { + z.data.end = z.raw.end + return + } + if c == '>' { + z.data.end = z.raw.end - len("--!>") + return + } + } + } + dashCount = 0 + } +} + +// readUntilCloseAngle reads until the next ">". +func (z *Tokenizer) readUntilCloseAngle() { + z.data.start = z.raw.end + for { + c := z.readByte() + if z.err != nil { + z.data.end = z.raw.end + return + } + if c == '>' { + z.data.end = z.raw.end - len(">") + return + } + } +} + +// readMarkupDeclaration reads the next token starting with "", a "", a "" or +// "': + if brackets >= 2 { + z.data.end = z.raw.end - len("]]>") + return true + } + brackets = 0 + default: + brackets = 0 + } + } +} + +// startTagIn returns whether the start tag in z.buf[z.data.start:z.data.end] +// case-insensitively matches any element of ss. +func (z *Tokenizer) startTagIn(ss ...string) bool { +loop: + for _, s := range ss { + if z.data.end-z.data.start != len(s) { + continue loop + } + for i := 0; i < len(s); i++ { + c := z.buf[z.data.start+i] + if 'A' <= c && c <= 'Z' { + c += 'a' - 'A' + } + if c != s[i] { + continue loop + } + } + return true + } + return false +} + +// readStartTag reads the next start tag token. The opening "". + if z.err == nil && z.buf[z.raw.end-2] == '/' { + return SelfClosingTagToken + } + return StartTagToken +} + +// readTag reads the next tag token and its attributes. If saveAttr, those +// attributes are saved in z.attr, otherwise z.attr is set to an empty slice. +// The opening "' { + break + } + z.raw.end-- + z.readTagAttrKey() + z.readTagAttrVal() + // Save pendingAttr if saveAttr and that attribute has a non-empty key. + if saveAttr && z.pendingAttr[0].start != z.pendingAttr[0].end { + z.attr = append(z.attr, z.pendingAttr) + } + if z.skipWhiteSpace(); z.err != nil { + break + } + } +} + +// readTagName sets z.data to the "div" in "". The reader (z.raw.end) +// is positioned such that the first byte of the tag name (the "d" in "': + z.raw.end-- + z.data.end = z.raw.end + return + } + } +} + +// readTagAttrKey sets z.pendingAttr[0] to the "k" in "". +// Precondition: z.err == nil. +func (z *Tokenizer) readTagAttrKey() { + z.pendingAttr[0].start = z.raw.end + for { + c := z.readByte() + if z.err != nil { + z.pendingAttr[0].end = z.raw.end + return + } + switch c { + case ' ', '\n', '\r', '\t', '\f', '/': + z.pendingAttr[0].end = z.raw.end - 1 + return + case '=', '>': + z.raw.end-- + z.pendingAttr[0].end = z.raw.end + return + } + } +} + +// readTagAttrVal sets z.pendingAttr[1] to the "v" in "". +func (z *Tokenizer) readTagAttrVal() { + z.pendingAttr[1].start = z.raw.end + z.pendingAttr[1].end = z.raw.end + if z.skipWhiteSpace(); z.err != nil { + return + } + c := z.readByte() + if z.err != nil { + return + } + if c != '=' { + z.raw.end-- + return + } + if z.skipWhiteSpace(); z.err != nil { + return + } + quote := z.readByte() + if z.err != nil { + return + } + switch quote { + case '>': + z.raw.end-- + return + + case '\'', '"': + z.pendingAttr[1].start = z.raw.end + for { + c := z.readByte() + if z.err != nil { + z.pendingAttr[1].end = z.raw.end + return + } + if c == quote { + z.pendingAttr[1].end = z.raw.end - 1 + return + } + } + + default: + z.pendingAttr[1].start = z.raw.end - 1 + for { + c := z.readByte() + if z.err != nil { + z.pendingAttr[1].end = z.raw.end + return + } + switch c { + case ' ', '\n', '\r', '\t', '\f': + z.pendingAttr[1].end = z.raw.end - 1 + return + case '>': + z.raw.end-- + z.pendingAttr[1].end = z.raw.end + return + } + } + } +} + +// Next scans the next token and returns its type. +func (z *Tokenizer) Next() TokenType { + z.raw.start = z.raw.end + z.data.start = z.raw.end + z.data.end = z.raw.end + if z.err != nil { + z.tt = ErrorToken + return z.tt + } + if z.rawTag != "" { + if z.rawTag == "plaintext" { + // Read everything up to EOF. + for z.err == nil { + z.readByte() + } + z.data.end = z.raw.end + z.textIsRaw = true + } else { + z.readRawOrRCDATA() + } + if z.data.end > z.data.start { + z.tt = TextToken + z.convertNUL = true + return z.tt + } + } + z.textIsRaw = false + z.convertNUL = false + +loop: + for { + c := z.readByte() + if z.err != nil { + break loop + } + if c != '<' { + continue loop + } + + // Check if the '<' we have just read is part of a tag, comment + // or doctype. If not, it's part of the accumulated text token. + c = z.readByte() + if z.err != nil { + break loop + } + var tokenType TokenType + switch { + case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z': + tokenType = StartTagToken + case c == '/': + tokenType = EndTagToken + case c == '!' || c == '?': + // We use CommentToken to mean any of "", + // "" and "". + tokenType = CommentToken + default: + // Reconsume the current character. + z.raw.end-- + continue + } + + // We have a non-text token, but we might have accumulated some text + // before that. If so, we return the text first, and return the non- + // text token on the subsequent call to Next. + if x := z.raw.end - len("' { + // ">" does not generate a token at all. Generate an empty comment + // to allow passthrough clients to pick up the data using Raw. + // Reset the tokenizer state and start again. + z.tt = CommentToken + return z.tt + } + if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' { + z.readTag(false) + if z.err != nil { + z.tt = ErrorToken + } else { + z.tt = EndTagToken + } + return z.tt + } + z.raw.end-- + z.readUntilCloseAngle() + z.tt = CommentToken + return z.tt + case CommentToken: + if c == '!' { + z.tt = z.readMarkupDeclaration() + return z.tt + } + z.raw.end-- + z.readUntilCloseAngle() + z.tt = CommentToken + return z.tt + } + } + if z.raw.start < z.raw.end { + z.data.end = z.raw.end + z.tt = TextToken + return z.tt + } + z.tt = ErrorToken + return z.tt +} + +// Raw returns the unmodified text of the current token. Calling Next, Token, +// Text, TagName or TagAttr may change the contents of the returned slice. +// +// The token stream's raw bytes partition the byte stream (up until an +// ErrorToken). There are no overlaps or gaps between two consecutive token's +// raw bytes. One implication is that the byte offset of the current token is +// the sum of the lengths of all previous tokens' raw bytes. +func (z *Tokenizer) Raw() []byte { + return z.buf[z.raw.start:z.raw.end] +} + +// convertNewlines converts "\r" and "\r\n" in s to "\n". +// The conversion happens in place, but the resulting slice may be shorter. +func convertNewlines(s []byte) []byte { + for i, c := range s { + if c != '\r' { + continue + } + + src := i + 1 + if src >= len(s) || s[src] != '\n' { + s[i] = '\n' + continue + } + + dst := i + for src < len(s) { + if s[src] == '\r' { + if src+1 < len(s) && s[src+1] == '\n' { + src++ + } + s[dst] = '\n' + } else { + s[dst] = s[src] + } + src++ + dst++ + } + return s[:dst] + } + return s +} + +var ( + nul = []byte("\x00") + replacement = []byte("\ufffd") +) + +// Text returns the unescaped text of a text, comment or doctype token. The +// contents of the returned slice may change on the next call to Next. +func (z *Tokenizer) Text() []byte { + switch z.tt { + case TextToken, CommentToken, DoctypeToken: + s := z.buf[z.data.start:z.data.end] + z.data.start = z.raw.end + z.data.end = z.raw.end + s = convertNewlines(s) + if (z.convertNUL || z.tt == CommentToken) && bytes.Contains(s, nul) { + s = bytes.Replace(s, nul, replacement, -1) + } + if !z.textIsRaw { + s = unescape(s, false) + } + return s + } + return nil +} + +// TagName returns the lower-cased name of a tag token (the `img` out of +// ``) and whether the tag has attributes. +// The contents of the returned slice may change on the next call to Next. +func (z *Tokenizer) TagName() (name []byte, hasAttr bool) { + if z.data.start < z.data.end { + switch z.tt { + case StartTagToken, EndTagToken, SelfClosingTagToken: + s := z.buf[z.data.start:z.data.end] + z.data.start = z.raw.end + z.data.end = z.raw.end + return lower(s), z.nAttrReturned < len(z.attr) + } + } + return nil, false +} + +// TagAttr returns the lower-cased key and unescaped value of the next unparsed +// attribute for the current tag token and whether there are more attributes. +// The contents of the returned slices may change on the next call to Next. +func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) { + if z.nAttrReturned < len(z.attr) { + switch z.tt { + case StartTagToken, SelfClosingTagToken: + x := z.attr[z.nAttrReturned] + z.nAttrReturned++ + key = z.buf[x[0].start:x[0].end] + val = z.buf[x[1].start:x[1].end] + return lower(key), unescape(convertNewlines(val), true), z.nAttrReturned < len(z.attr) + } + } + return nil, nil, false +} + +// Token returns the current Token. The result's Data and Attr values remain +// valid after subsequent Next calls. +func (z *Tokenizer) Token() Token { + t := Token{Type: z.tt} + switch z.tt { + case TextToken, CommentToken, DoctypeToken: + t.Data = string(z.Text()) + case StartTagToken, SelfClosingTagToken, EndTagToken: + name, moreAttr := z.TagName() + for moreAttr { + var key, val []byte + key, val, moreAttr = z.TagAttr() + t.Attr = append(t.Attr, Attribute{"", atom.String(key), string(val)}) + } + if a := atom.Lookup(name); a != 0 { + t.DataAtom, t.Data = a, a.String() + } else { + t.DataAtom, t.Data = 0, string(name) + } + } + return t +} + +// SetMaxBuf sets a limit on the amount of data buffered during tokenization. +// A value of 0 means unlimited. +func (z *Tokenizer) SetMaxBuf(n int) { + z.maxBuf = n +} + +// NewTokenizer returns a new HTML Tokenizer for the given Reader. +// The input is assumed to be UTF-8 encoded. +func NewTokenizer(r io.Reader) *Tokenizer { + return NewTokenizerFragment(r, "") +} + +// NewTokenizerFragment returns a new HTML Tokenizer for the given Reader, for +// tokenizing an existing element's InnerHTML fragment. contextTag is that +// element's tag, such as "div" or "iframe". +// +// For example, how the InnerHTML "a tag or a
with two sibling +// children; the is reparented to the
" as text instead of an element. + // rawTag's contents are lower-cased. + rawTag string + // textIsRaw is whether the current text token's data is not escaped. + textIsRaw bool + // convertNUL is whether NUL bytes in the current token's data should + // be converted into \ufffd replacement characters. + convertNUL bool + // allowCDATA is whether CDATA sections are allowed in the current context. + allowCDATA bool +} + +// AllowCDATA sets whether or not the tokenizer recognizes as +// the text "foo". The default value is false, which means to recognize it as +// a bogus comment "" instead. +// +// Strictly speaking, an HTML5 compliant tokenizer should allow CDATA if and +// only if tokenizing foreign content, such as MathML and SVG. However, +// tracking foreign-contentness is difficult to do purely in the tokenizer, +// as opposed to the parser, due to HTML integration points: an element +// can contain a that is foreign-to-SVG but not foreign-to- +// HTML. For strict compliance with the HTML5 tokenization algorithm, it is the +// responsibility of the user of a tokenizer to call AllowCDATA as appropriate. +// In practice, if using the tokenizer without caring whether MathML or SVG +// CDATA is text or comments, such as tokenizing HTML to find all the anchor +// text, it is acceptable to ignore this responsibility. +func (z *Tokenizer) AllowCDATA(allowCDATA bool) { + z.allowCDATA = allowCDATA +} + +// NextIsNotRawText instructs the tokenizer that the next token should not be +// considered as 'raw text'. Some elements, such as script and title elements, +// normally require the next token after the opening tag to be 'raw text' that +// has no child elements. For example, tokenizing "acd" +// yields a start tag token for "", a text token for "acd", and +// an end tag token for "". There are no distinct start tag or end tag +// tokens for the "" and "". +// +// This tokenizer implementation will generally look for raw text at the right +// times. Strictly speaking, an HTML5 compliant tokenizer should not look for +// raw text if in foreign content: generally needs raw text, but a +// inside an does not. Another example is that a +// generally needs raw text, but a is not allowed as an immediate +// child of a ; in normal parsing, a implies , but +// one cannot close the implicit element when parsing a 's InnerHTML. +// Similarly to AllowCDATA, tracking the correct moment to override raw-text- +// ness is difficult to do purely in the tokenizer, as opposed to the parser. +// For strict compliance with the HTML5 tokenization algorithm, it is the +// responsibility of the user of a tokenizer to call NextIsNotRawText as +// appropriate. In practice, like AllowCDATA, it is acceptable to ignore this +// responsibility for basic usage. +// +// Note that this 'raw text' concept is different from the one offered by the +// Tokenizer.Raw method. +func (z *Tokenizer) NextIsNotRawText() { + z.rawTag = "" +} + +// Err returns the error associated with the most recent ErrorToken token. +// This is typically io.EOF, meaning the end of tokenization. +func (z *Tokenizer) Err() error { + if z.tt != ErrorToken { + return nil + } + return z.err +} + +// readByte returns the next byte from the input stream, doing a buffered read +// from z.r into z.buf if necessary. z.buf[z.raw.start:z.raw.end] remains a contiguous byte +// slice that holds all the bytes read so far for the current token. +// It sets z.err if the underlying reader returns an error. +// Pre-condition: z.err == nil. +func (z *Tokenizer) readByte() byte { + if z.raw.end >= len(z.buf) { + // Our buffer is exhausted and we have to read from z.r. Check if the + // previous read resulted in an error. + if z.readErr != nil { + z.err = z.readErr + return 0 + } + // We copy z.buf[z.raw.start:z.raw.end] to the beginning of z.buf. If the length + // z.raw.end - z.raw.start is more than half the capacity of z.buf, then we + // allocate a new buffer before the copy. + c := cap(z.buf) + d := z.raw.end - z.raw.start + var buf1 []byte + if 2*d > c { + buf1 = make([]byte, d, 2*c) + } else { + buf1 = z.buf[:d] + } + copy(buf1, z.buf[z.raw.start:z.raw.end]) + if x := z.raw.start; x != 0 { + // Adjust the data/attr spans to refer to the same contents after the copy. + z.data.start -= x + z.data.end -= x + z.pendingAttr[0].start -= x + z.pendingAttr[0].end -= x + z.pendingAttr[1].start -= x + z.pendingAttr[1].end -= x + for i := range z.attr { + z.attr[i][0].start -= x + z.attr[i][0].end -= x + z.attr[i][1].start -= x + z.attr[i][1].end -= x + } + } + z.raw.start, z.raw.end, z.buf = 0, d, buf1[:d] + // Now that we have copied the live bytes to the start of the buffer, + // we read from z.r into the remainder. + var n int + n, z.readErr = readAtLeastOneByte(z.r, buf1[d:cap(buf1)]) + if n == 0 { + z.err = z.readErr + return 0 + } + z.buf = buf1[:d+n] + } + x := z.buf[z.raw.end] + z.raw.end++ + if z.maxBuf > 0 && z.raw.end-z.raw.start >= z.maxBuf { + z.err = ErrBufferExceeded + return 0 + } + return x +} + +// Buffered returns a slice containing data buffered but not yet tokenized. +func (z *Tokenizer) Buffered() []byte { + return z.buf[z.raw.end:] +} + +// readAtLeastOneByte wraps an io.Reader so that reading cannot return (0, nil). +// It returns io.ErrNoProgress if the underlying r.Read method returns (0, nil) +// too many times in succession. +func readAtLeastOneByte(r io.Reader, b []byte) (int, error) { + for i := 0; i < 100; i++ { + if n, err := r.Read(b); n != 0 || err != nil { + return n, err + } + } + return 0, io.ErrNoProgress +} + +// skipWhiteSpace skips past any white space. +func (z *Tokenizer) skipWhiteSpace() { + if z.err != nil { + return + } + for { + c := z.readByte() + if z.err != nil { + return + } + switch c { + case ' ', '\n', '\r', '\t', '\f': + // No-op. + default: + z.raw.end-- + return + } + } +} + +// readRawOrRCDATA reads until the next "", where "foo" is z.rawTag and +// is typically something like "script" or "textarea". +func (z *Tokenizer) readRawOrRCDATA() { + if z.rawTag == "script" { + z.readScript() + z.textIsRaw = true + z.rawTag = "" + return + } +loop: + for { + c := z.readByte() + if z.err != nil { + break loop + } + if c != '<' { + continue loop + } + c = z.readByte() + if z.err != nil { + break loop + } + if c != '/' { + z.raw.end-- + continue loop + } + if z.readRawEndTag() || z.err != nil { + break loop + } + } + z.data.end = z.raw.end + // A textarea's or title's RCDATA can contain escaped entities. + z.textIsRaw = z.rawTag != "textarea" && z.rawTag != "title" + z.rawTag = "" +} + +// readRawEndTag attempts to read a tag like "", where "foo" is z.rawTag. +// If it succeeds, it backs up the input position to reconsume the tag and +// returns true. Otherwise it returns false. The opening "" has already been +// consumed. +func (z *Tokenizer) readRawEndTag() bool { + for i := 0; i < len(z.rawTag); i++ { + c := z.readByte() + if z.err != nil { + return false + } + if c != z.rawTag[i] && c != z.rawTag[i]-('a'-'A') { + z.raw.end-- + return false + } + } + c := z.readByte() + if z.err != nil { + return false + } + switch c { + case ' ', '\n', '\r', '\t', '\f', '/', '>': + // The 3 is 2 for the leading "" plus 1 for the trailing character c. + z.raw.end -= 3 + len(z.rawTag) + return true + } + z.raw.end-- + return false +} + +// readScript reads until the next tag, following the byzantine +// rules for escaping/hiding the closing tag. +func (z *Tokenizer) readScript() { + defer func() { + z.data.end = z.raw.end + }() + var c byte + +scriptData: + c = z.readByte() + if z.err != nil { + return + } + if c == '<' { + goto scriptDataLessThanSign + } + goto scriptData + +scriptDataLessThanSign: + c = z.readByte() + if z.err != nil { + return + } + switch c { + case '/': + goto scriptDataEndTagOpen + case '!': + goto scriptDataEscapeStart + } + z.raw.end-- + goto scriptData + +scriptDataEndTagOpen: + if z.readRawEndTag() || z.err != nil { + return + } + goto scriptData + +scriptDataEscapeStart: + c = z.readByte() + if z.err != nil { + return + } + if c == '-' { + goto scriptDataEscapeStartDash + } + z.raw.end-- + goto scriptData + +scriptDataEscapeStartDash: + c = z.readByte() + if z.err != nil { + return + } + if c == '-' { + goto scriptDataEscapedDashDash + } + z.raw.end-- + goto scriptData + +scriptDataEscaped: + c = z.readByte() + if z.err != nil { + return + } + switch c { + case '-': + goto scriptDataEscapedDash + case '<': + goto scriptDataEscapedLessThanSign + } + goto scriptDataEscaped + +scriptDataEscapedDash: + c = z.readByte() + if z.err != nil { + return + } + switch c { + case '-': + goto scriptDataEscapedDashDash + case '<': + goto scriptDataEscapedLessThanSign + } + goto scriptDataEscaped + +scriptDataEscapedDashDash: + c = z.readByte() + if z.err != nil { + return + } + switch c { + case '-': + goto scriptDataEscapedDashDash + case '<': + goto scriptDataEscapedLessThanSign + case '>': + goto scriptData + } + goto scriptDataEscaped + +scriptDataEscapedLessThanSign: + c = z.readByte() + if z.err != nil { + return + } + if c == '/' { + goto scriptDataEscapedEndTagOpen + } + if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' { + goto scriptDataDoubleEscapeStart + } + z.raw.end-- + goto scriptData + +scriptDataEscapedEndTagOpen: + if z.readRawEndTag() || z.err != nil { + return + } + goto scriptDataEscaped + +scriptDataDoubleEscapeStart: + z.raw.end-- + for i := 0; i < len("script"); i++ { + c = z.readByte() + if z.err != nil { + return + } + if c != "script"[i] && c != "SCRIPT"[i] { + z.raw.end-- + goto scriptDataEscaped + } + } + c = z.readByte() + if z.err != nil { + return + } + switch c { + case ' ', '\n', '\r', '\t', '\f', '/', '>': + goto scriptDataDoubleEscaped + } + z.raw.end-- + goto scriptDataEscaped + +scriptDataDoubleEscaped: + c = z.readByte() + if z.err != nil { + return + } + switch c { + case '-': + goto scriptDataDoubleEscapedDash + case '<': + goto scriptDataDoubleEscapedLessThanSign + } + goto scriptDataDoubleEscaped + +scriptDataDoubleEscapedDash: + c = z.readByte() + if z.err != nil { + return + } + switch c { + case '-': + goto scriptDataDoubleEscapedDashDash + case '<': + goto scriptDataDoubleEscapedLessThanSign + } + goto scriptDataDoubleEscaped + +scriptDataDoubleEscapedDashDash: + c = z.readByte() + if z.err != nil { + return + } + switch c { + case '-': + goto scriptDataDoubleEscapedDashDash + case '<': + goto scriptDataDoubleEscapedLessThanSign + case '>': + goto scriptData + } + goto scriptDataDoubleEscaped + +scriptDataDoubleEscapedLessThanSign: + c = z.readByte() + if z.err != nil { + return + } + if c == '/' { + goto scriptDataDoubleEscapeEnd + } + z.raw.end-- + goto scriptDataDoubleEscaped + +scriptDataDoubleEscapeEnd: + if z.readRawEndTag() { + z.raw.end += len("") + goto scriptDataEscaped + } + if z.err != nil { + return + } + goto scriptDataDoubleEscaped +} + +// readComment reads the next comment token starting with ". + z.data.end = z.data.start + } + }() + for dashCount := 2; ; { + c := z.readByte() + if z.err != nil { + // Ignore up to two dashes at EOF. + if dashCount > 2 { + dashCount = 2 + } + z.data.end = z.raw.end - dashCount + return + } + switch c { + case '-': + dashCount++ + continue + case '>': + if dashCount >= 2 { + z.data.end = z.raw.end - len("-->") + return + } + case '!': + if dashCount >= 2 { + c = z.readByte() + if z.err != nil { + z.data.end = z.raw.end + return + } + if c == '>' { + z.data.end = z.raw.end - len("--!>") + return + } + } + } + dashCount = 0 + } +} + +// readUntilCloseAngle reads until the next ">". +func (z *Tokenizer) readUntilCloseAngle() { + z.data.start = z.raw.end + for { + c := z.readByte() + if z.err != nil { + z.data.end = z.raw.end + return + } + if c == '>' { + z.data.end = z.raw.end - len(">") + return + } + } +} + +// readMarkupDeclaration reads the next token starting with "", a "", a "" or +// "': + if brackets >= 2 { + z.data.end = z.raw.end - len("]]>") + return true + } + brackets = 0 + default: + brackets = 0 + } + } +} + +// startTagIn returns whether the start tag in z.buf[z.data.start:z.data.end] +// case-insensitively matches any element of ss. +func (z *Tokenizer) startTagIn(ss ...string) bool { +loop: + for _, s := range ss { + if z.data.end-z.data.start != len(s) { + continue loop + } + for i := 0; i < len(s); i++ { + c := z.buf[z.data.start+i] + if 'A' <= c && c <= 'Z' { + c += 'a' - 'A' + } + if c != s[i] { + continue loop + } + } + return true + } + return false +} + +// readStartTag reads the next start tag token. The opening "". + if z.err == nil && z.buf[z.raw.end-2] == '/' { + return SelfClosingTagToken + } + return StartTagToken +} + +// readTag reads the next tag token and its attributes. If saveAttr, those +// attributes are saved in z.attr, otherwise z.attr is set to an empty slice. +// The opening "' { + break + } + z.raw.end-- + z.readTagAttrKey() + z.readTagAttrVal() + // Save pendingAttr if saveAttr and that attribute has a non-empty key. + if saveAttr && z.pendingAttr[0].start != z.pendingAttr[0].end { + z.attr = append(z.attr, z.pendingAttr) + } + if z.skipWhiteSpace(); z.err != nil { + break + } + } +} + +// readTagName sets z.data to the "div" in "". The reader (z.raw.end) +// is positioned such that the first byte of the tag name (the "d" in "': + z.raw.end-- + z.data.end = z.raw.end + return + } + } +} + +// readTagAttrKey sets z.pendingAttr[0] to the "k" in "". +// Precondition: z.err == nil. +func (z *Tokenizer) readTagAttrKey() { + z.pendingAttr[0].start = z.raw.end + for { + c := z.readByte() + if z.err != nil { + z.pendingAttr[0].end = z.raw.end + return + } + switch c { + case ' ', '\n', '\r', '\t', '\f', '/': + z.pendingAttr[0].end = z.raw.end - 1 + return + case '=', '>': + z.raw.end-- + z.pendingAttr[0].end = z.raw.end + return + } + } +} + +// readTagAttrVal sets z.pendingAttr[1] to the "v" in "". +func (z *Tokenizer) readTagAttrVal() { + z.pendingAttr[1].start = z.raw.end + z.pendingAttr[1].end = z.raw.end + if z.skipWhiteSpace(); z.err != nil { + return + } + c := z.readByte() + if z.err != nil { + return + } + if c != '=' { + z.raw.end-- + return + } + if z.skipWhiteSpace(); z.err != nil { + return + } + quote := z.readByte() + if z.err != nil { + return + } + switch quote { + case '>': + z.raw.end-- + return + + case '\'', '"': + z.pendingAttr[1].start = z.raw.end + for { + c := z.readByte() + if z.err != nil { + z.pendingAttr[1].end = z.raw.end + return + } + if c == quote { + z.pendingAttr[1].end = z.raw.end - 1 + return + } + } + + default: + z.pendingAttr[1].start = z.raw.end - 1 + for { + c := z.readByte() + if z.err != nil { + z.pendingAttr[1].end = z.raw.end + return + } + switch c { + case ' ', '\n', '\r', '\t', '\f': + z.pendingAttr[1].end = z.raw.end - 1 + return + case '>': + z.raw.end-- + z.pendingAttr[1].end = z.raw.end + return + } + } + } +} + +// Next scans the next token and returns its type. +func (z *Tokenizer) Next() TokenType { + z.raw.start = z.raw.end + z.data.start = z.raw.end + z.data.end = z.raw.end + if z.err != nil { + z.tt = ErrorToken + return z.tt + } + if z.rawTag != "" { + if z.rawTag == "plaintext" { + // Read everything up to EOF. + for z.err == nil { + z.readByte() + } + z.data.end = z.raw.end + z.textIsRaw = true + } else { + z.readRawOrRCDATA() + } + if z.data.end > z.data.start { + z.tt = TextToken + z.convertNUL = true + return z.tt + } + } + z.textIsRaw = false + z.convertNUL = false + +loop: + for { + c := z.readByte() + if z.err != nil { + break loop + } + if c != '<' { + continue loop + } + + // Check if the '<' we have just read is part of a tag, comment + // or doctype. If not, it's part of the accumulated text token. + c = z.readByte() + if z.err != nil { + break loop + } + var tokenType TokenType + switch { + case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z': + tokenType = StartTagToken + case c == '/': + tokenType = EndTagToken + case c == '!' || c == '?': + // We use CommentToken to mean any of "", + // "" and "". + tokenType = CommentToken + default: + // Reconsume the current character. + z.raw.end-- + continue + } + + // We have a non-text token, but we might have accumulated some text + // before that. If so, we return the text first, and return the non- + // text token on the subsequent call to Next. + if x := z.raw.end - len("' { + // ">" does not generate a token at all. Generate an empty comment + // to allow passthrough clients to pick up the data using Raw. + // Reset the tokenizer state and start again. + z.tt = CommentToken + return z.tt + } + if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' { + z.readTag(false) + if z.err != nil { + z.tt = ErrorToken + } else { + z.tt = EndTagToken + } + return z.tt + } + z.raw.end-- + z.readUntilCloseAngle() + z.tt = CommentToken + return z.tt + case CommentToken: + if c == '!' { + z.tt = z.readMarkupDeclaration() + return z.tt + } + z.raw.end-- + z.readUntilCloseAngle() + z.tt = CommentToken + return z.tt + } + } + if z.raw.start < z.raw.end { + z.data.end = z.raw.end + z.tt = TextToken + return z.tt + } + z.tt = ErrorToken + return z.tt +} + +// Raw returns the unmodified text of the current token. Calling Next, Token, +// Text, TagName or TagAttr may change the contents of the returned slice. +// +// The token stream's raw bytes partition the byte stream (up until an +// ErrorToken). There are no overlaps or gaps between two consecutive token's +// raw bytes. One implication is that the byte offset of the current token is +// the sum of the lengths of all previous tokens' raw bytes. +func (z *Tokenizer) Raw() []byte { + return z.buf[z.raw.start:z.raw.end] +} + +// convertNewlines converts "\r" and "\r\n" in s to "\n". +// The conversion happens in place, but the resulting slice may be shorter. +func convertNewlines(s []byte) []byte { + for i, c := range s { + if c != '\r' { + continue + } + + src := i + 1 + if src >= len(s) || s[src] != '\n' { + s[i] = '\n' + continue + } + + dst := i + for src < len(s) { + if s[src] == '\r' { + if src+1 < len(s) && s[src+1] == '\n' { + src++ + } + s[dst] = '\n' + } else { + s[dst] = s[src] + } + src++ + dst++ + } + return s[:dst] + } + return s +} + +var ( + nul = []byte("\x00") + replacement = []byte("\ufffd") +) + +// Text returns the unescaped text of a text, comment or doctype token. The +// contents of the returned slice may change on the next call to Next. +func (z *Tokenizer) Text() []byte { + switch z.tt { + case TextToken, CommentToken, DoctypeToken: + s := z.buf[z.data.start:z.data.end] + z.data.start = z.raw.end + z.data.end = z.raw.end + s = convertNewlines(s) + if (z.convertNUL || z.tt == CommentToken) && bytes.Contains(s, nul) { + s = bytes.Replace(s, nul, replacement, -1) + } + if !z.textIsRaw { + s = unescape(s, false) + } + return s + } + return nil +} + +// TagName returns the lower-cased name of a tag token (the `img` out of +// ``) and whether the tag has attributes. +// The contents of the returned slice may change on the next call to Next. +func (z *Tokenizer) TagName() (name []byte, hasAttr bool) { + if z.data.start < z.data.end { + switch z.tt { + case StartTagToken, EndTagToken, SelfClosingTagToken: + s := z.buf[z.data.start:z.data.end] + z.data.start = z.raw.end + z.data.end = z.raw.end + return lower(s), z.nAttrReturned < len(z.attr) + } + } + return nil, false +} + +// TagAttr returns the lower-cased key and unescaped value of the next unparsed +// attribute for the current tag token and whether there are more attributes. +// The contents of the returned slices may change on the next call to Next. +func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) { + if z.nAttrReturned < len(z.attr) { + switch z.tt { + case StartTagToken, SelfClosingTagToken: + x := z.attr[z.nAttrReturned] + z.nAttrReturned++ + key = z.buf[x[0].start:x[0].end] + val = z.buf[x[1].start:x[1].end] + return lower(key), unescape(convertNewlines(val), true), z.nAttrReturned < len(z.attr) + } + } + return nil, nil, false +} + +// Token returns the current Token. The result's Data and Attr values remain +// valid after subsequent Next calls. +func (z *Tokenizer) Token() Token { + t := Token{Type: z.tt} + switch z.tt { + case TextToken, CommentToken, DoctypeToken: + t.Data = string(z.Text()) + case StartTagToken, SelfClosingTagToken, EndTagToken: + name, moreAttr := z.TagName() + for moreAttr { + var key, val []byte + key, val, moreAttr = z.TagAttr() + t.Attr = append(t.Attr, Attribute{"", atom.String(key), string(val)}) + } + if a := atom.Lookup(name); a != 0 { + t.DataAtom, t.Data = a, a.String() + } else { + t.DataAtom, t.Data = 0, string(name) + } + } + return t +} + +// SetMaxBuf sets a limit on the amount of data buffered during tokenization. +// A value of 0 means unlimited. +func (z *Tokenizer) SetMaxBuf(n int) { + z.maxBuf = n +} + +// NewTokenizer returns a new HTML Tokenizer for the given Reader. +// The input is assumed to be UTF-8 encoded. +func NewTokenizer(r io.Reader) *Tokenizer { + return NewTokenizerFragment(r, "") +} + +// NewTokenizerFragment returns a new HTML Tokenizer for the given Reader, for +// tokenizing an existing element's InnerHTML fragment. contextTag is that +// element's tag, such as "div" or "iframe". +// +// For example, how the InnerHTML "a tag or a