diff options
Diffstat (limited to 'vendor/github.com/microcosm-cc/bluemonday/sanitize.go')
-rw-r--r-- | vendor/github.com/microcosm-cc/bluemonday/sanitize.go | 535 |
1 files changed, 535 insertions, 0 deletions
diff --git a/vendor/github.com/microcosm-cc/bluemonday/sanitize.go b/vendor/github.com/microcosm-cc/bluemonday/sanitize.go new file mode 100644 index 0000000000..8983685818 --- /dev/null +++ b/vendor/github.com/microcosm-cc/bluemonday/sanitize.go @@ -0,0 +1,535 @@ +// Copyright (c) 2014, David Kitchen <david@buro9.com> +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * Neither the name of the organisation (Microcosm) nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +package bluemonday + +import ( + "bytes" + "io" + "net/url" + "strings" + + "golang.org/x/net/html" +) + +// Sanitize takes a string that contains a HTML fragment or document and applies +// the given policy whitelist. +// +// It returns a HTML string that has been sanitized by the policy or an empty +// string if an error has occurred (most likely as a consequence of extremely +// malformed input) +func (p *Policy) Sanitize(s string) string { + if strings.TrimSpace(s) == "" { + return s + } + + return p.sanitize(strings.NewReader(s)).String() +} + +// SanitizeBytes takes a []byte that contains a HTML fragment or document and applies +// the given policy whitelist. +// +// It returns a []byte containing the HTML that has been sanitized by the policy +// or an empty []byte if an error has occurred (most likely as a consequence of +// extremely malformed input) +func (p *Policy) SanitizeBytes(b []byte) []byte { + if len(bytes.TrimSpace(b)) == 0 { + return b + } + + return p.sanitize(bytes.NewReader(b)).Bytes() +} + +// SanitizeReader takes an io.Reader that contains a HTML fragment or document +// and applies the given policy whitelist. +// +// It returns a bytes.Buffer containing the HTML that has been sanitized by the +// policy. Errors during sanitization will merely return an empty result. +func (p *Policy) SanitizeReader(r io.Reader) *bytes.Buffer { + return p.sanitize(r) +} + +// Performs the actual sanitization process. +func (p *Policy) sanitize(r io.Reader) *bytes.Buffer { + + // It is possible that the developer has created the policy via: + // p := bluemonday.Policy{} + // rather than: + // p := bluemonday.NewPolicy() + // If this is the case, and if they haven't yet triggered an action that + // would initiliaze the maps, then we need to do that. + p.init() + + var ( + buff bytes.Buffer + skipElementContent bool + skippingElementsCount int64 + skipClosingTag bool + closingTagToSkipStack []string + mostRecentlyStartedToken string + ) + + tokenizer := html.NewTokenizer(r) + for { + if tokenizer.Next() == html.ErrorToken { + err := tokenizer.Err() + if err == io.EOF { + // End of input means end of processing + return &buff + } + + // Raw tokenizer error + return &bytes.Buffer{} + } + + token := tokenizer.Token() + switch token.Type { + case html.DoctypeToken: + + if p.allowDocType { + buff.WriteString(token.String()) + } + + case html.CommentToken: + + // Comments are ignored by default + + case html.StartTagToken: + + mostRecentlyStartedToken = token.Data + + aps, ok := p.elsAndAttrs[token.Data] + if !ok { + if _, ok := p.setOfElementsToSkipContent[token.Data]; ok { + skipElementContent = true + skippingElementsCount++ + } + if p.addSpaces { + buff.WriteString(" ") + } + break + } + + if len(token.Attr) != 0 { + token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps) + } + + if len(token.Attr) == 0 { + if !p.allowNoAttrs(token.Data) { + skipClosingTag = true + closingTagToSkipStack = append(closingTagToSkipStack, token.Data) + if p.addSpaces { + buff.WriteString(" ") + } + break + } + } + + if !skipElementContent { + buff.WriteString(token.String()) + } + + case html.EndTagToken: + + if skipClosingTag && closingTagToSkipStack[len(closingTagToSkipStack)-1] == token.Data { + closingTagToSkipStack = closingTagToSkipStack[:len(closingTagToSkipStack)-1] + if len(closingTagToSkipStack) == 0 { + skipClosingTag = false + } + if p.addSpaces { + buff.WriteString(" ") + } + break + } + + if _, ok := p.elsAndAttrs[token.Data]; !ok { + if _, ok := p.setOfElementsToSkipContent[token.Data]; ok { + skippingElementsCount-- + if skippingElementsCount == 0 { + skipElementContent = false + } + } + if p.addSpaces { + buff.WriteString(" ") + } + break + } + + if !skipElementContent { + buff.WriteString(token.String()) + } + + case html.SelfClosingTagToken: + + aps, ok := p.elsAndAttrs[token.Data] + if !ok { + if p.addSpaces { + buff.WriteString(" ") + } + break + } + + if len(token.Attr) != 0 { + token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps) + } + + if len(token.Attr) == 0 && !p.allowNoAttrs(token.Data) { + if p.addSpaces { + buff.WriteString(" ") + } + break + } + + if !skipElementContent { + buff.WriteString(token.String()) + } + + case html.TextToken: + + if !skipElementContent { + switch strings.ToLower(mostRecentlyStartedToken) { + case "javascript": + // not encouraged, but if a policy allows JavaScript we + // should not HTML escape it as that would break the output + buff.WriteString(token.Data) + case "style": + // not encouraged, but if a policy allows CSS styles we + // should not HTML escape it as that would break the output + buff.WriteString(token.Data) + default: + // HTML escape the text + buff.WriteString(token.String()) + } + } + + default: + // A token that didn't exist in the html package when we wrote this + return &bytes.Buffer{} + } + } +} + +// sanitizeAttrs takes a set of element attribute policies and the global +// attribute policies and applies them to the []html.Attribute returning a set +// of html.Attributes that match the policies +func (p *Policy) sanitizeAttrs( + elementName string, + attrs []html.Attribute, + aps map[string]attrPolicy, +) []html.Attribute { + + if len(attrs) == 0 { + return attrs + } + + // Builds a new attribute slice based on the whether the attribute has been + // whitelisted explicitly or globally. + cleanAttrs := []html.Attribute{} + for _, htmlAttr := range attrs { + // Is there an element specific attribute policy that applies? + if ap, ok := aps[htmlAttr.Key]; ok { + if ap.regexp != nil { + if ap.regexp.MatchString(htmlAttr.Val) { + cleanAttrs = append(cleanAttrs, htmlAttr) + continue + } + } else { + cleanAttrs = append(cleanAttrs, htmlAttr) + continue + } + } + + // Is there a global attribute policy that applies? + if ap, ok := p.globalAttrs[htmlAttr.Key]; ok { + if ap.regexp != nil { + if ap.regexp.MatchString(htmlAttr.Val) { + cleanAttrs = append(cleanAttrs, htmlAttr) + } + } else { + cleanAttrs = append(cleanAttrs, htmlAttr) + } + } + } + + if len(cleanAttrs) == 0 { + // If nothing was allowed, let's get out of here + return cleanAttrs + } + // cleanAttrs now contains the attributes that are permitted + + if linkable(elementName) { + if p.requireParseableURLs { + // Ensure URLs are parseable: + // - a.href + // - area.href + // - link.href + // - blockquote.cite + // - q.cite + // - img.src + // - script.src + tmpAttrs := []html.Attribute{} + for _, htmlAttr := range cleanAttrs { + switch elementName { + case "a", "area", "link": + if htmlAttr.Key == "href" { + if u, ok := p.validURL(htmlAttr.Val); ok { + htmlAttr.Val = u + tmpAttrs = append(tmpAttrs, htmlAttr) + } + break + } + tmpAttrs = append(tmpAttrs, htmlAttr) + case "blockquote", "q": + if htmlAttr.Key == "cite" { + if u, ok := p.validURL(htmlAttr.Val); ok { + htmlAttr.Val = u + tmpAttrs = append(tmpAttrs, htmlAttr) + } + break + } + tmpAttrs = append(tmpAttrs, htmlAttr) + case "img", "script": + if htmlAttr.Key == "src" { + if u, ok := p.validURL(htmlAttr.Val); ok { + htmlAttr.Val = u + tmpAttrs = append(tmpAttrs, htmlAttr) + } + break + } + tmpAttrs = append(tmpAttrs, htmlAttr) + default: + tmpAttrs = append(tmpAttrs, htmlAttr) + } + } + cleanAttrs = tmpAttrs + } + + if (p.requireNoFollow || + p.requireNoFollowFullyQualifiedLinks || + p.addTargetBlankToFullyQualifiedLinks) && + len(cleanAttrs) > 0 { + + // Add rel="nofollow" if a "href" exists + switch elementName { + case "a", "area", "link": + var hrefFound bool + var externalLink bool + for _, htmlAttr := range cleanAttrs { + if htmlAttr.Key == "href" { + hrefFound = true + + u, err := url.Parse(htmlAttr.Val) + if err != nil { + continue + } + if u.Host != "" { + externalLink = true + } + + continue + } + } + + if hrefFound { + var ( + noFollowFound bool + targetBlankFound bool + ) + + addNoFollow := (p.requireNoFollow || + externalLink && p.requireNoFollowFullyQualifiedLinks) + + addTargetBlank := (externalLink && + p.addTargetBlankToFullyQualifiedLinks) + + tmpAttrs := []html.Attribute{} + for _, htmlAttr := range cleanAttrs { + + var appended bool + if htmlAttr.Key == "rel" && addNoFollow { + + if strings.Contains(htmlAttr.Val, "nofollow") { + noFollowFound = true + tmpAttrs = append(tmpAttrs, htmlAttr) + appended = true + } else { + htmlAttr.Val += " nofollow" + noFollowFound = true + tmpAttrs = append(tmpAttrs, htmlAttr) + appended = true + } + } + + if elementName == "a" && htmlAttr.Key == "target" { + if htmlAttr.Val == "_blank" { + targetBlankFound = true + } + if addTargetBlank && !targetBlankFound { + htmlAttr.Val = "_blank" + targetBlankFound = true + tmpAttrs = append(tmpAttrs, htmlAttr) + appended = true + } + } + + if !appended { + tmpAttrs = append(tmpAttrs, htmlAttr) + } + } + if noFollowFound || targetBlankFound { + cleanAttrs = tmpAttrs + } + + if addNoFollow && !noFollowFound { + rel := html.Attribute{} + rel.Key = "rel" + rel.Val = "nofollow" + cleanAttrs = append(cleanAttrs, rel) + } + + if elementName == "a" && addTargetBlank && !targetBlankFound { + rel := html.Attribute{} + rel.Key = "target" + rel.Val = "_blank" + targetBlankFound = true + cleanAttrs = append(cleanAttrs, rel) + } + + if targetBlankFound { + // target="_blank" has a security risk that allows the + // opened window/tab to issue JavaScript calls against + // window.opener, which in effect allow the destination + // of the link to control the source: + // https://dev.to/ben/the-targetblank-vulnerability-by-example + // + // To mitigate this risk, we need to add a specific rel + // attribute if it is not already present. + // rel="noopener" + // + // Unfortunately this is processing the rel twice (we + // already looked at it earlier ^^) as we cannot be sure + // of the ordering of the href and rel, and whether we + // have fully satisfied that we need to do this. This + // double processing only happens *if* target="_blank" + // is true. + var noOpenerAdded bool + tmpAttrs := []html.Attribute{} + for _, htmlAttr := range cleanAttrs { + var appended bool + if htmlAttr.Key == "rel" { + if strings.Contains(htmlAttr.Val, "noopener") { + noOpenerAdded = true + tmpAttrs = append(tmpAttrs, htmlAttr) + } else { + htmlAttr.Val += " noopener" + noOpenerAdded = true + tmpAttrs = append(tmpAttrs, htmlAttr) + } + + appended = true + } + if !appended { + tmpAttrs = append(tmpAttrs, htmlAttr) + } + } + if noOpenerAdded { + cleanAttrs = tmpAttrs + } else { + // rel attr was not found, or else noopener would + // have been added already + rel := html.Attribute{} + rel.Key = "rel" + rel.Val = "noopener" + cleanAttrs = append(cleanAttrs, rel) + } + + } + } + default: + } + } + } + + return cleanAttrs +} + +func (p *Policy) allowNoAttrs(elementName string) bool { + _, ok := p.setOfElementsAllowedWithoutAttrs[elementName] + return ok +} + +func (p *Policy) validURL(rawurl string) (string, bool) { + if p.requireParseableURLs { + // URLs do not contain whitespace + if strings.Contains(rawurl, " ") || + strings.Contains(rawurl, "\t") || + strings.Contains(rawurl, "\n") { + return "", false + } + + u, err := url.Parse(rawurl) + if err != nil { + return "", false + } + + if u.Scheme != "" { + + urlPolicy, ok := p.allowURLSchemes[u.Scheme] + if !ok { + return "", false + + } + + if urlPolicy == nil || urlPolicy(u) == true { + return u.String(), true + } + + return "", false + } + + if p.allowRelativeURLs { + if u.String() != "" { + return u.String(), true + } + } + + return "", false + } + + return rawurl, true +} + +func linkable(elementName string) bool { + switch elementName { + case "a", "area", "blockquote", "img", "link", "script": + return true + default: + return false + } +} |