diff options
author | zeripath <art27@cantab.net> | 2019-12-31 01:53:28 +0000 |
---|---|---|
committer | Lauris BH <lauris@nix.lv> | 2019-12-31 03:53:28 +0200 |
commit | 27757714d0420192e6139d1e4206446dcefe6531 (patch) | |
tree | d237deeb48315f1465dbd915b65f95973c4cbbe9 /modules/markup | |
parent | 0c07f1de5b8cf7b9f9f607aae76a53c99aeb2c04 (diff) | |
download | gitea-27757714d0420192e6139d1e4206446dcefe6531.tar.gz gitea-27757714d0420192e6139d1e4206446dcefe6531.zip |
Change markdown rendering from blackfriday to goldmark (#9533)
* Move to goldmark
Markdown rendering moved from blackfriday to the goldmark.
Multiple subtle changes required to the goldmark extensions to keep
current rendering and defaults.
Can go further with goldmark linkify and have this work within markdown
rendering making the link processor unnecessary.
Need to think about how to go about allowing extensions - at present it
seems that these would be hard to do without recompilation.
* linter fixes
Co-authored-by: Lauris BH <lauris@nix.lv>
Diffstat (limited to 'modules/markup')
-rw-r--r-- | modules/markup/common/footnote.go | 507 | ||||
-rw-r--r-- | modules/markup/common/html.go | 19 | ||||
-rw-r--r-- | modules/markup/common/linkify.go | 156 | ||||
-rw-r--r-- | modules/markup/html.go | 15 | ||||
-rw-r--r-- | modules/markup/html_test.go | 4 | ||||
-rw-r--r-- | modules/markup/markdown/goldmark.go | 178 | ||||
-rw-r--r-- | modules/markup/markdown/markdown.go | 207 | ||||
-rw-r--r-- | modules/markup/markdown/markdown_test.go | 40 | ||||
-rw-r--r-- | modules/markup/mdstripper/mdstripper.go | 177 | ||||
-rw-r--r-- | modules/markup/mdstripper/mdstripper_test.go | 14 | ||||
-rw-r--r-- | modules/markup/sanitizer.go | 8 |
11 files changed, 1064 insertions, 261 deletions
diff --git a/modules/markup/common/footnote.go b/modules/markup/common/footnote.go new file mode 100644 index 0000000000..ad4cd7f2e1 --- /dev/null +++ b/modules/markup/common/footnote.go @@ -0,0 +1,507 @@ +// Copyright 2019 Yusuke Inuzuka +// Copyright 2019 The Gitea Authors. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +// Most of what follows is a subtly changed version of github.com/yuin/goldmark/extension/footnote.go + +package common + +import ( + "bytes" + "fmt" + "os" + "strconv" + "unicode" + + "github.com/yuin/goldmark" + "github.com/yuin/goldmark/ast" + "github.com/yuin/goldmark/parser" + "github.com/yuin/goldmark/renderer" + "github.com/yuin/goldmark/renderer/html" + "github.com/yuin/goldmark/text" + "github.com/yuin/goldmark/util" +) + +// CleanValue will clean a value to make it safe to be an id +// This function is quite different from the original goldmark function +// and more closely matches the output from the shurcooL sanitizer +// In particular Unicode letters and numbers are a lot more than a-zA-Z0-9... +func CleanValue(value []byte) []byte { + value = bytes.TrimSpace(value) + rs := bytes.Runes(value) + result := make([]rune, 0, len(rs)) + needsDash := false + for _, r := range rs { + switch { + case unicode.IsLetter(r) || unicode.IsNumber(r): + if needsDash && len(result) > 0 { + result = append(result, '-') + } + needsDash = false + result = append(result, unicode.ToLower(r)) + default: + needsDash = true + } + } + return []byte(string(result)) +} + +// Most of what follows is a subtly changed version of github.com/yuin/goldmark/extension/footnote.go + +// A FootnoteLink struct represents a link to a footnote of Markdown +// (PHP Markdown Extra) text. +type FootnoteLink struct { + ast.BaseInline + Index int + Name []byte +} + +// Dump implements Node.Dump. +func (n *FootnoteLink) Dump(source []byte, level int) { + m := map[string]string{} + m["Index"] = fmt.Sprintf("%v", n.Index) + m["Name"] = fmt.Sprintf("%v", n.Name) + ast.DumpHelper(n, source, level, m, nil) +} + +// KindFootnoteLink is a NodeKind of the FootnoteLink node. +var KindFootnoteLink = ast.NewNodeKind("GiteaFootnoteLink") + +// Kind implements Node.Kind. +func (n *FootnoteLink) Kind() ast.NodeKind { + return KindFootnoteLink +} + +// NewFootnoteLink returns a new FootnoteLink node. +func NewFootnoteLink(index int, name []byte) *FootnoteLink { + return &FootnoteLink{ + Index: index, + Name: name, + } +} + +// A FootnoteBackLink struct represents a link to a footnote of Markdown +// (PHP Markdown Extra) text. +type FootnoteBackLink struct { + ast.BaseInline + Index int + Name []byte +} + +// Dump implements Node.Dump. +func (n *FootnoteBackLink) Dump(source []byte, level int) { + m := map[string]string{} + m["Index"] = fmt.Sprintf("%v", n.Index) + m["Name"] = fmt.Sprintf("%v", n.Name) + ast.DumpHelper(n, source, level, m, nil) +} + +// KindFootnoteBackLink is a NodeKind of the FootnoteBackLink node. +var KindFootnoteBackLink = ast.NewNodeKind("GiteaFootnoteBackLink") + +// Kind implements Node.Kind. +func (n *FootnoteBackLink) Kind() ast.NodeKind { + return KindFootnoteBackLink +} + +// NewFootnoteBackLink returns a new FootnoteBackLink node. +func NewFootnoteBackLink(index int, name []byte) *FootnoteBackLink { + return &FootnoteBackLink{ + Index: index, + Name: name, + } +} + +// A Footnote struct represents a footnote of Markdown +// (PHP Markdown Extra) text. +type Footnote struct { + ast.BaseBlock + Ref []byte + Index int + Name []byte +} + +// Dump implements Node.Dump. +func (n *Footnote) Dump(source []byte, level int) { + m := map[string]string{} + m["Index"] = fmt.Sprintf("%v", n.Index) + m["Ref"] = fmt.Sprintf("%s", n.Ref) + m["Name"] = fmt.Sprintf("%v", n.Name) + ast.DumpHelper(n, source, level, m, nil) +} + +// KindFootnote is a NodeKind of the Footnote node. +var KindFootnote = ast.NewNodeKind("GiteaFootnote") + +// Kind implements Node.Kind. +func (n *Footnote) Kind() ast.NodeKind { + return KindFootnote +} + +// NewFootnote returns a new Footnote node. +func NewFootnote(ref []byte) *Footnote { + return &Footnote{ + Ref: ref, + Index: -1, + Name: ref, + } +} + +// A FootnoteList struct represents footnotes of Markdown +// (PHP Markdown Extra) text. +type FootnoteList struct { + ast.BaseBlock + Count int +} + +// Dump implements Node.Dump. +func (n *FootnoteList) Dump(source []byte, level int) { + m := map[string]string{} + m["Count"] = fmt.Sprintf("%v", n.Count) + ast.DumpHelper(n, source, level, m, nil) +} + +// KindFootnoteList is a NodeKind of the FootnoteList node. +var KindFootnoteList = ast.NewNodeKind("GiteaFootnoteList") + +// Kind implements Node.Kind. +func (n *FootnoteList) Kind() ast.NodeKind { + return KindFootnoteList +} + +// NewFootnoteList returns a new FootnoteList node. +func NewFootnoteList() *FootnoteList { + return &FootnoteList{ + Count: 0, + } +} + +var footnoteListKey = parser.NewContextKey() + +type footnoteBlockParser struct { +} + +var defaultFootnoteBlockParser = &footnoteBlockParser{} + +// NewFootnoteBlockParser returns a new parser.BlockParser that can parse +// footnotes of the Markdown(PHP Markdown Extra) text. +func NewFootnoteBlockParser() parser.BlockParser { + return defaultFootnoteBlockParser +} + +func (b *footnoteBlockParser) Trigger() []byte { + return []byte{'['} +} + +func (b *footnoteBlockParser) Open(parent ast.Node, reader text.Reader, pc parser.Context) (ast.Node, parser.State) { + line, segment := reader.PeekLine() + pos := pc.BlockOffset() + if pos < 0 || line[pos] != '[' { + return nil, parser.NoChildren + } + pos++ + if pos > len(line)-1 || line[pos] != '^' { + return nil, parser.NoChildren + } + open := pos + 1 + closes := 0 + closure := util.FindClosure(line[pos+1:], '[', ']', false, false) + closes = pos + 1 + closure + next := closes + 1 + if closure > -1 { + if next >= len(line) || line[next] != ':' { + return nil, parser.NoChildren + } + } else { + return nil, parser.NoChildren + } + padding := segment.Padding + label := reader.Value(text.NewSegment(segment.Start+open-padding, segment.Start+closes-padding)) + if util.IsBlank(label) { + return nil, parser.NoChildren + } + item := NewFootnote(label) + + pos = next + 1 - padding + if pos >= len(line) { + reader.Advance(pos) + return item, parser.NoChildren + } + reader.AdvanceAndSetPadding(pos, padding) + return item, parser.HasChildren +} + +func (b *footnoteBlockParser) Continue(node ast.Node, reader text.Reader, pc parser.Context) parser.State { + line, _ := reader.PeekLine() + if util.IsBlank(line) { + return parser.Continue | parser.HasChildren + } + childpos, padding := util.IndentPosition(line, reader.LineOffset(), 4) + if childpos < 0 { + return parser.Close + } + reader.AdvanceAndSetPadding(childpos, padding) + return parser.Continue | parser.HasChildren +} + +func (b *footnoteBlockParser) Close(node ast.Node, reader text.Reader, pc parser.Context) { + var list *FootnoteList + if tlist := pc.Get(footnoteListKey); tlist != nil { + list = tlist.(*FootnoteList) + } else { + list = NewFootnoteList() + pc.Set(footnoteListKey, list) + node.Parent().InsertBefore(node.Parent(), node, list) + } + node.Parent().RemoveChild(node.Parent(), node) + list.AppendChild(list, node) +} + +func (b *footnoteBlockParser) CanInterruptParagraph() bool { + return true +} + +func (b *footnoteBlockParser) CanAcceptIndentedLine() bool { + return false +} + +type footnoteParser struct { +} + +var defaultFootnoteParser = &footnoteParser{} + +// NewFootnoteParser returns a new parser.InlineParser that can parse +// footnote links of the Markdown(PHP Markdown Extra) text. +func NewFootnoteParser() parser.InlineParser { + return defaultFootnoteParser +} + +func (s *footnoteParser) Trigger() []byte { + // footnote syntax probably conflict with the image syntax. + // So we need trigger this parser with '!'. + return []byte{'!', '['} +} + +func (s *footnoteParser) Parse(parent ast.Node, block text.Reader, pc parser.Context) ast.Node { + line, segment := block.PeekLine() + pos := 1 + if len(line) > 0 && line[0] == '!' { + pos++ + } + if pos >= len(line) || line[pos] != '^' { + return nil + } + pos++ + if pos >= len(line) { + return nil + } + open := pos + closure := util.FindClosure(line[pos:], '[', ']', false, false) + if closure < 0 { + return nil + } + closes := pos + closure + value := block.Value(text.NewSegment(segment.Start+open, segment.Start+closes)) + block.Advance(closes + 1) + + var list *FootnoteList + if tlist := pc.Get(footnoteListKey); tlist != nil { + list = tlist.(*FootnoteList) + } + if list == nil { + return nil + } + index := 0 + name := []byte{} + for def := list.FirstChild(); def != nil; def = def.NextSibling() { + d := def.(*Footnote) + if bytes.Equal(d.Ref, value) { + if d.Index < 0 { + list.Count++ + d.Index = list.Count + val := CleanValue(d.Name) + if len(val) == 0 { + val = []byte(strconv.Itoa(d.Index)) + } + d.Name = pc.IDs().Generate(val, KindFootnote) + } + index = d.Index + name = d.Name + break + } + } + if index == 0 { + return nil + } + + return NewFootnoteLink(index, name) +} + +type footnoteASTTransformer struct { +} + +var defaultFootnoteASTTransformer = &footnoteASTTransformer{} + +// NewFootnoteASTTransformer returns a new parser.ASTTransformer that +// insert a footnote list to the last of the document. +func NewFootnoteASTTransformer() parser.ASTTransformer { + return defaultFootnoteASTTransformer +} + +func (a *footnoteASTTransformer) Transform(node *ast.Document, reader text.Reader, pc parser.Context) { + var list *FootnoteList + if tlist := pc.Get(footnoteListKey); tlist != nil { + list = tlist.(*FootnoteList) + } else { + return + } + pc.Set(footnoteListKey, nil) + for footnote := list.FirstChild(); footnote != nil; { + var container ast.Node = footnote + next := footnote.NextSibling() + if fc := container.LastChild(); fc != nil && ast.IsParagraph(fc) { + container = fc + } + footnoteNode := footnote.(*Footnote) + index := footnoteNode.Index + name := footnoteNode.Name + if index < 0 { + list.RemoveChild(list, footnote) + } else { + container.AppendChild(container, NewFootnoteBackLink(index, name)) + } + footnote = next + } + list.SortChildren(func(n1, n2 ast.Node) int { + if n1.(*Footnote).Index < n2.(*Footnote).Index { + return -1 + } + return 1 + }) + if list.Count <= 0 { + list.Parent().RemoveChild(list.Parent(), list) + return + } + + node.AppendChild(node, list) +} + +// FootnoteHTMLRenderer is a renderer.NodeRenderer implementation that +// renders FootnoteLink nodes. +type FootnoteHTMLRenderer struct { + html.Config +} + +// NewFootnoteHTMLRenderer returns a new FootnoteHTMLRenderer. +func NewFootnoteHTMLRenderer(opts ...html.Option) renderer.NodeRenderer { + r := &FootnoteHTMLRenderer{ + Config: html.NewConfig(), + } + for _, opt := range opts { + opt.SetHTMLOption(&r.Config) + } + return r +} + +// RegisterFuncs implements renderer.NodeRenderer.RegisterFuncs. +func (r *FootnoteHTMLRenderer) RegisterFuncs(reg renderer.NodeRendererFuncRegisterer) { + reg.Register(KindFootnoteLink, r.renderFootnoteLink) + reg.Register(KindFootnoteBackLink, r.renderFootnoteBackLink) + reg.Register(KindFootnote, r.renderFootnote) + reg.Register(KindFootnoteList, r.renderFootnoteList) +} + +func (r *FootnoteHTMLRenderer) renderFootnoteLink(w util.BufWriter, source []byte, node ast.Node, entering bool) (ast.WalkStatus, error) { + if entering { + n := node.(*FootnoteLink) + n.Dump(source, 0) + is := strconv.Itoa(n.Index) + _, _ = w.WriteString(`<sup id="fnref:`) + _, _ = w.Write(n.Name) + _, _ = w.WriteString(`"><a href="#fn:`) + _, _ = w.Write(n.Name) + _, _ = w.WriteString(`" class="footnote-ref" role="doc-noteref">`) + _, _ = w.WriteString(is) + _, _ = w.WriteString(`</a></sup>`) + } + return ast.WalkContinue, nil +} + +func (r *FootnoteHTMLRenderer) renderFootnoteBackLink(w util.BufWriter, source []byte, node ast.Node, entering bool) (ast.WalkStatus, error) { + if entering { + n := node.(*FootnoteBackLink) + fmt.Fprintf(os.Stdout, "source:\n%s\n", string(n.Text(source))) + _, _ = w.WriteString(` <a href="#fnref:`) + _, _ = w.Write(n.Name) + _, _ = w.WriteString(`" class="footnote-backref" role="doc-backlink">`) + _, _ = w.WriteString("↩︎") + _, _ = w.WriteString(`</a>`) + } + return ast.WalkContinue, nil +} + +func (r *FootnoteHTMLRenderer) renderFootnote(w util.BufWriter, source []byte, node ast.Node, entering bool) (ast.WalkStatus, error) { + n := node.(*Footnote) + if entering { + fmt.Fprintf(os.Stdout, "source:\n%s\n", string(n.Text(source))) + _, _ = w.WriteString(`<li id="fn:`) + _, _ = w.Write(n.Name) + _, _ = w.WriteString(`" role="doc-endnote"`) + if node.Attributes() != nil { + html.RenderAttributes(w, node, html.ListItemAttributeFilter) + } + _, _ = w.WriteString(">\n") + } else { + _, _ = w.WriteString("</li>\n") + } + return ast.WalkContinue, nil +} + +func (r *FootnoteHTMLRenderer) renderFootnoteList(w util.BufWriter, source []byte, node ast.Node, entering bool) (ast.WalkStatus, error) { + tag := "div" + if entering { + _, _ = w.WriteString("<") + _, _ = w.WriteString(tag) + _, _ = w.WriteString(` class="footnotes" role="doc-endnotes"`) + if node.Attributes() != nil { + html.RenderAttributes(w, node, html.GlobalAttributeFilter) + } + _ = w.WriteByte('>') + if r.Config.XHTML { + _, _ = w.WriteString("\n<hr />\n") + } else { + _, _ = w.WriteString("\n<hr>\n") + } + _, _ = w.WriteString("<ol>\n") + } else { + _, _ = w.WriteString("</ol>\n") + _, _ = w.WriteString("</") + _, _ = w.WriteString(tag) + _, _ = w.WriteString(">\n") + } + return ast.WalkContinue, nil +} + +type footnoteExtension struct{} + +// FootnoteExtension represents the Gitea Footnote +var FootnoteExtension = &footnoteExtension{} + +// Extend extends the markdown converter with the Gitea Footnote parser +func (e *footnoteExtension) Extend(m goldmark.Markdown) { + m.Parser().AddOptions( + parser.WithBlockParsers( + util.Prioritized(NewFootnoteBlockParser(), 999), + ), + parser.WithInlineParsers( + util.Prioritized(NewFootnoteParser(), 101), + ), + parser.WithASTTransformers( + util.Prioritized(NewFootnoteASTTransformer(), 999), + ), + ) + m.Renderer().AddOptions(renderer.WithNodeRenderers( + util.Prioritized(NewFootnoteHTMLRenderer(), 500), + )) +} diff --git a/modules/markup/common/html.go b/modules/markup/common/html.go new file mode 100644 index 0000000000..3a47686f1e --- /dev/null +++ b/modules/markup/common/html.go @@ -0,0 +1,19 @@ +// Copyright 2019 The Gitea Authors. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package common + +import ( + "mvdan.cc/xurls/v2" +) + +var ( + // NOTE: All below regex matching do not perform any extra validation. + // Thus a link is produced even if the linked entity does not exist. + // While fast, this is also incorrect and lead to false positives. + // TODO: fix invalid linking issue + + // LinkRegex is a regexp matching a valid link + LinkRegex, _ = xurls.StrictMatchingScheme("https?://") +) diff --git a/modules/markup/common/linkify.go b/modules/markup/common/linkify.go new file mode 100644 index 0000000000..6ae70fba34 --- /dev/null +++ b/modules/markup/common/linkify.go @@ -0,0 +1,156 @@ +// Copyright 2019 Yusuke Inuzuka +// Copyright 2019 The Gitea Authors. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +// Most of this file is a subtly changed version of github.com/yuin/goldmark/extension/linkify.go + +package common + +import ( + "bytes" + "regexp" + + "github.com/yuin/goldmark" + "github.com/yuin/goldmark/ast" + "github.com/yuin/goldmark/parser" + "github.com/yuin/goldmark/text" + "github.com/yuin/goldmark/util" +) + +var wwwURLRegxp = regexp.MustCompile(`^www\.[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}((?:/|[#?])[-a-zA-Z0-9@:%_\+.~#!?&//=\(\);,'">\^{}\[\]` + "`" + `]*)?`) + +type linkifyParser struct { +} + +var defaultLinkifyParser = &linkifyParser{} + +// NewLinkifyParser return a new InlineParser can parse +// text that seems like a URL. +func NewLinkifyParser() parser.InlineParser { + return defaultLinkifyParser +} + +func (s *linkifyParser) Trigger() []byte { + // ' ' indicates any white spaces and a line head + return []byte{' ', '*', '_', '~', '('} +} + +var protoHTTP = []byte("http:") +var protoHTTPS = []byte("https:") +var protoFTP = []byte("ftp:") +var domainWWW = []byte("www.") + +func (s *linkifyParser) Parse(parent ast.Node, block text.Reader, pc parser.Context) ast.Node { + if pc.IsInLinkLabel() { + return nil + } + line, segment := block.PeekLine() + consumes := 0 + start := segment.Start + c := line[0] + // advance if current position is not a line head. + if c == ' ' || c == '*' || c == '_' || c == '~' || c == '(' { + consumes++ + start++ + line = line[1:] + } + + var m []int + var protocol []byte + var typ ast.AutoLinkType = ast.AutoLinkURL + if bytes.HasPrefix(line, protoHTTP) || bytes.HasPrefix(line, protoHTTPS) || bytes.HasPrefix(line, protoFTP) { + m = LinkRegex.FindSubmatchIndex(line) + } + if m == nil && bytes.HasPrefix(line, domainWWW) { + m = wwwURLRegxp.FindSubmatchIndex(line) + protocol = []byte("http") + } + if m != nil { + lastChar := line[m[1]-1] + if lastChar == '.' { + m[1]-- + } else if lastChar == ')' { + closing := 0 + for i := m[1] - 1; i >= m[0]; i-- { + if line[i] == ')' { + closing++ + } else if line[i] == '(' { + closing-- + } + } + if closing > 0 { + m[1] -= closing + } + } else if lastChar == ';' { + i := m[1] - 2 + for ; i >= m[0]; i-- { + if util.IsAlphaNumeric(line[i]) { + continue + } + break + } + if i != m[1]-2 { + if line[i] == '&' { + m[1] -= m[1] - i + } + } + } + } + if m == nil { + if len(line) > 0 && util.IsPunct(line[0]) { + return nil + } + typ = ast.AutoLinkEmail + stop := util.FindEmailIndex(line) + if stop < 0 { + return nil + } + at := bytes.IndexByte(line, '@') + m = []int{0, stop, at, stop - 1} + if m == nil || bytes.IndexByte(line[m[2]:m[3]], '.') < 0 { + return nil + } + lastChar := line[m[1]-1] + if lastChar == '.' { + m[1]-- + } + if m[1] < len(line) { + nextChar := line[m[1]] + if nextChar == '-' || nextChar == '_' { + return nil + } + } + } + if m == nil { + return nil + } + if consumes != 0 { + s := segment.WithStop(segment.Start + 1) + ast.MergeOrAppendTextSegment(parent, s) + } + consumes += m[1] + block.Advance(consumes) + n := ast.NewTextSegment(text.NewSegment(start, start+m[1])) + link := ast.NewAutoLink(typ, n) + link.Protocol = protocol + return link +} + +func (s *linkifyParser) CloseBlock(parent ast.Node, pc parser.Context) { + // nothing to do +} + +type linkify struct { +} + +// Linkify is an extension that allow you to parse text that seems like a URL. +var Linkify = &linkify{} + +func (e *linkify) Extend(m goldmark.Markdown) { + m.Parser().AddOptions( + parser.WithInlineParsers( + util.Prioritized(NewLinkifyParser(), 999), + ), + ) +} diff --git a/modules/markup/html.go b/modules/markup/html.go index b10da40fc1..2c6773bce4 100644 --- a/modules/markup/html.go +++ b/modules/markup/html.go @@ -15,6 +15,7 @@ import ( "code.gitea.io/gitea/modules/base" "code.gitea.io/gitea/modules/git" "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/markup/common" "code.gitea.io/gitea/modules/references" "code.gitea.io/gitea/modules/setting" "code.gitea.io/gitea/modules/util" @@ -57,8 +58,6 @@ var ( // https://html.spec.whatwg.org/multipage/input.html#e-mail-state-(type%3Demail) emailRegex = regexp.MustCompile("(?:\\s|^|\\(|\\[)([a-zA-Z0-9.!#$%&'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9]{2,}(?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)+)(?:\\s|$|\\)|\\]|\\.(\\s|$))") - linkRegex, _ = xurls.StrictMatchingScheme("https?://") - // blackfriday extensions create IDs like fn:user-content-footnote blackfridayExtRegex = regexp.MustCompile(`[^:]*:user-content-`) ) @@ -118,7 +117,7 @@ func CustomLinkURLSchemes(schemes []string) { } withAuth = append(withAuth, s) } - linkRegex, _ = xurls.StrictMatchingScheme(strings.Join(withAuth, "|")) + common.LinkRegex, _ = xurls.StrictMatchingScheme(strings.Join(withAuth, "|")) } // IsSameDomain checks if given url string has the same hostname as current Gitea instance @@ -509,6 +508,12 @@ func shortLinkProcessorFull(ctx *postProcessCtx, node *html.Node, noLink bool) { (strings.HasPrefix(val, "‘") && strings.HasSuffix(val, "’")) { const lenQuote = len("‘") val = val[lenQuote : len(val)-lenQuote] + } else if (strings.HasPrefix(val, "\"") && strings.HasSuffix(val, "\"")) || + (strings.HasPrefix(val, "'") && strings.HasSuffix(val, "'")) { + val = val[1 : len(val)-1] + } else if strings.HasPrefix(val, "'") && strings.HasSuffix(val, "’") { + const lenQuote = len("‘") + val = val[1 : len(val)-lenQuote] } props[key] = val } @@ -803,7 +808,7 @@ func emailAddressProcessor(ctx *postProcessCtx, node *html.Node) { // linkProcessor creates links for any HTTP or HTTPS URL not captured by // markdown. func linkProcessor(ctx *postProcessCtx, node *html.Node) { - m := linkRegex.FindStringIndex(node.Data) + m := common.LinkRegex.FindStringIndex(node.Data) if m == nil { return } @@ -832,7 +837,7 @@ func genDefaultLinkProcessor(defaultLink string) processor { // descriptionLinkProcessor creates links for DescriptionHTML func descriptionLinkProcessor(ctx *postProcessCtx, node *html.Node) { - m := linkRegex.FindStringIndex(node.Data) + m := common.LinkRegex.FindStringIndex(node.Data) if m == nil { return } diff --git a/modules/markup/html_test.go b/modules/markup/html_test.go index 07747e97e1..91ef320b40 100644 --- a/modules/markup/html_test.go +++ b/modules/markup/html_test.go @@ -323,6 +323,6 @@ func TestRender_ShortLinks(t *testing.T) { `<p><a href="`+notencodedImgurlWiki+`" rel="nofollow"><img src="`+notencodedImgurlWiki+`"/></a></p>`) test( "<p><a href=\"https://example.org\">[[foobar]]</a></p>", - `<p></p><p><a href="https://example.org" rel="nofollow">[[foobar]]</a></p><p></p>`, - `<p></p><p><a href="https://example.org" rel="nofollow">[[foobar]]</a></p><p></p>`) + `<p><a href="https://example.org" rel="nofollow">[[foobar]]</a></p>`, + `<p><a href="https://example.org" rel="nofollow">[[foobar]]</a></p>`) } diff --git a/modules/markup/markdown/goldmark.go b/modules/markup/markdown/goldmark.go new file mode 100644 index 0000000000..2a2a9dce6a --- /dev/null +++ b/modules/markup/markdown/goldmark.go @@ -0,0 +1,178 @@ +// Copyright 2019 The Gitea Authors. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package markdown + +import ( + "bytes" + "fmt" + "strings" + + "code.gitea.io/gitea/modules/markup" + "code.gitea.io/gitea/modules/markup/common" + giteautil "code.gitea.io/gitea/modules/util" + + "github.com/yuin/goldmark/ast" + east "github.com/yuin/goldmark/extension/ast" + "github.com/yuin/goldmark/parser" + "github.com/yuin/goldmark/renderer" + "github.com/yuin/goldmark/renderer/html" + "github.com/yuin/goldmark/text" + "github.com/yuin/goldmark/util" +) + +var byteMailto = []byte("mailto:") + +// GiteaASTTransformer is a default transformer of the goldmark tree. +type GiteaASTTransformer struct{} + +// Transform transforms the given AST tree. +func (g *GiteaASTTransformer) Transform(node *ast.Document, reader text.Reader, pc parser.Context) { + _ = ast.Walk(node, func(n ast.Node, entering bool) (ast.WalkStatus, error) { + if !entering { + return ast.WalkContinue, nil + } + + switch v := n.(type) { + case *ast.Image: + // Images need two things: + // + // 1. Their src needs to munged to be a real value + // 2. If they're not wrapped with a link they need a link wrapper + + // Check if the destination is a real link + link := v.Destination + if len(link) > 0 && !markup.IsLink(link) { + prefix := pc.Get(urlPrefixKey).(string) + if pc.Get(isWikiKey).(bool) { + prefix = giteautil.URLJoin(prefix, "wiki", "raw") + } + prefix = strings.Replace(prefix, "/src/", "/media/", 1) + + lnk := string(link) + lnk = giteautil.URLJoin(prefix, lnk) + lnk = strings.Replace(lnk, " ", "+", -1) + link = []byte(lnk) + } + v.Destination = link + + parent := n.Parent() + // Create a link around image only if parent is not already a link + if _, ok := parent.(*ast.Link); !ok && parent != nil { + wrap := ast.NewLink() + wrap.Destination = link + wrap.Title = v.Title + parent.ReplaceChild(parent, n, wrap) + wrap.AppendChild(wrap, n) + } + case *ast.Link: + // Links need their href to munged to be a real value + link := v.Destination + if len(link) > 0 && !markup.IsLink(link) && + link[0] != '#' && !bytes.HasPrefix(link, byteMailto) { + // special case: this is not a link, a hash link or a mailto:, so it's a + // relative URL + lnk := string(link) + if pc.Get(isWikiKey).(bool) { + lnk = giteautil.URLJoin("wiki", lnk) + } + link = []byte(giteautil.URLJoin(pc.Get(urlPrefixKey).(string), lnk)) + } + v.Destination = link + } + return ast.WalkContinue, nil + }) +} + +type prefixedIDs struct { + values map[string]bool +} + +// Generate generates a new element id. +func (p *prefixedIDs) Generate(value []byte, kind ast.NodeKind) []byte { + dft := []byte("id") + if kind == ast.KindHeading { + dft = []byte("heading") + } + return p.GenerateWithDefault(value, dft) +} + +// Generate generates a new element id. +func (p *prefixedIDs) GenerateWithDefault(value []byte, dft []byte) []byte { + result := common.CleanValue(value) + if len(result) == 0 { + result = dft + } + if !bytes.HasPrefix(result, []byte("user-content-")) { + result = append([]byte("user-content-"), result...) + } + if _, ok := p.values[util.BytesToReadOnlyString(result)]; !ok { + p.values[util.BytesToReadOnlyString(result)] = true + return result + } + for i := 1; ; i++ { + newResult := fmt.Sprintf("%s-%d", result, i) + if _, ok := p.values[newResult]; !ok { + p.values[newResult] = true + return []byte(newResult) + } + } +} + +// Put puts a given element id to the used ids table. +func (p *prefixedIDs) Put(value []byte) { + p.values[util.BytesToReadOnlyString(value)] = true +} + +func newPrefixedIDs() *prefixedIDs { + return &prefixedIDs{ + values: map[string]bool{}, + } +} + +// NewTaskCheckBoxHTMLRenderer creates a TaskCheckBoxHTMLRenderer to render tasklists +// in the gitea form. +func NewTaskCheckBoxHTMLRenderer(opts ...html.Option) renderer.NodeRenderer { + r := &TaskCheckBoxHTMLRenderer{ + Config: html.NewConfig(), + } + for _, opt := range opts { + opt.SetHTMLOption(&r.Config) + } + return r +} + +// TaskCheckBoxHTMLRenderer is a renderer.NodeRenderer implementation that +// renders checkboxes in list items. +// Overrides the default goldmark one to present the gitea format +type TaskCheckBoxHTMLRenderer struct { + html.Config +} + +// RegisterFuncs implements renderer.NodeRenderer.RegisterFuncs. +func (r *TaskCheckBoxHTMLRenderer) RegisterFuncs(reg renderer.NodeRendererFuncRegisterer) { + reg.Register(east.KindTaskCheckBox, r.renderTaskCheckBox) +} + +func (r *TaskCheckBoxHTMLRenderer) renderTaskCheckBox(w util.BufWriter, source []byte, node ast.Node, entering bool) (ast.WalkStatus, error) { + if !entering { + return ast.WalkContinue, nil + } + n := node.(*east.TaskCheckBox) + + end := ">" + if r.XHTML { + end = " />" + } + var err error + if n.IsChecked { + _, err = w.WriteString(`<span class="ui fitted disabled checkbox"><input type="checkbox" disabled="disabled"` + end + `<label` + end + `</span>`) + } else { + _, err = w.WriteString(`<span class="ui checked fitted disabled checkbox"><input type="checkbox" checked="" disabled="disabled"` + end + `<label` + end + `</span>`) + } + if err != nil { + return ast.WalkStop, err + } + return ast.WalkContinue, nil +} diff --git a/modules/markup/markdown/markdown.go b/modules/markup/markdown/markdown.go index f1e44a8fbc..5230fca4dc 100644 --- a/modules/markup/markdown/markdown.go +++ b/modules/markup/markdown/markdown.go @@ -7,161 +7,83 @@ package markdown import ( "bytes" - "io" - "strings" + "sync" + "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/markup" + "code.gitea.io/gitea/modules/markup/common" "code.gitea.io/gitea/modules/setting" - "code.gitea.io/gitea/modules/util" - - "github.com/russross/blackfriday/v2" + giteautil "code.gitea.io/gitea/modules/util" + + "github.com/yuin/goldmark" + "github.com/yuin/goldmark/extension" + "github.com/yuin/goldmark/parser" + "github.com/yuin/goldmark/renderer" + "github.com/yuin/goldmark/renderer/html" + "github.com/yuin/goldmark/util" ) -// Renderer is a extended version of underlying render object. -type Renderer struct { - blackfriday.Renderer - URLPrefix string - IsWiki bool -} +var converter goldmark.Markdown +var once = sync.Once{} -var byteMailto = []byte("mailto:") +var urlPrefixKey = parser.NewContextKey() +var isWikiKey = parser.NewContextKey() -var htmlEscaper = [256][]byte{ - '&': []byte("&"), - '<': []byte("<"), - '>': []byte(">"), - '"': []byte("""), +// NewGiteaParseContext creates a parser.Context with the gitea context set +func NewGiteaParseContext(urlPrefix string, isWiki bool) parser.Context { + pc := parser.NewContext(parser.WithIDs(newPrefixedIDs())) + pc.Set(urlPrefixKey, urlPrefix) + pc.Set(isWikiKey, isWiki) + return pc } -func escapeHTML(w io.Writer, s []byte) { - var start, end int - for end < len(s) { - escSeq := htmlEscaper[s[end]] - if escSeq != nil { - _, _ = w.Write(s[start:end]) - _, _ = w.Write(escSeq) - start = end + 1 - } - end++ - } - if start < len(s) && end <= len(s) { - _, _ = w.Write(s[start:end]) - } -} - -// RenderNode is a default renderer of a single node of a syntax tree. For -// block nodes it will be called twice: first time with entering=true, second -// time with entering=false, so that it could know when it's working on an open -// tag and when on close. It writes the result to w. -// -// The return value is a way to tell the calling walker to adjust its walk -// pattern: e.g. it can terminate the traversal by returning Terminate. Or it -// can ask the walker to skip a subtree of this node by returning SkipChildren. -// The typical behavior is to return GoToNext, which asks for the usual -// traversal to the next node. -func (r *Renderer) RenderNode(w io.Writer, node *blackfriday.Node, entering bool) blackfriday.WalkStatus { - switch node.Type { - case blackfriday.Image: - prefix := r.URLPrefix - if r.IsWiki { - prefix = util.URLJoin(prefix, "wiki", "raw") - } - prefix = strings.Replace(prefix, "/src/", "/media/", 1) - link := node.LinkData.Destination - if len(link) > 0 && !markup.IsLink(link) { - lnk := string(link) - lnk = util.URLJoin(prefix, lnk) - lnk = strings.Replace(lnk, " ", "+", -1) - link = []byte(lnk) - } - node.LinkData.Destination = link - // Render link around image only if parent is not link already - if node.Parent != nil && node.Parent.Type != blackfriday.Link { - if entering { - _, _ = w.Write([]byte(`<a href="`)) - escapeHTML(w, link) - _, _ = w.Write([]byte(`">`)) - return r.Renderer.RenderNode(w, node, entering) - } - s := r.Renderer.RenderNode(w, node, entering) - _, _ = w.Write([]byte(`</a>`)) - return s - } - return r.Renderer.RenderNode(w, node, entering) - case blackfriday.Link: - // special case: this is not a link, a hash link or a mailto:, so it's a - // relative URL - link := node.LinkData.Destination - if len(link) > 0 && !markup.IsLink(link) && - link[0] != '#' && !bytes.HasPrefix(link, byteMailto) && - node.LinkData.Footnote == nil { - lnk := string(link) - if r.IsWiki { - lnk = util.URLJoin("wiki", lnk) - } - link = []byte(util.URLJoin(r.URLPrefix, lnk)) - } - node.LinkData.Destination = link - return r.Renderer.RenderNode(w, node, entering) - case blackfriday.Text: - isListItem := false - for n := node.Parent; n != nil; n = n.Parent { - if n.Type == blackfriday.Item { - isListItem = true - break - } - } - if isListItem { - text := node.Literal - switch { - case bytes.HasPrefix(text, []byte("[ ] ")): - _, _ = w.Write([]byte(`<span class="ui fitted disabled checkbox"><input type="checkbox" disabled="disabled" /><label /></span>`)) - text = text[3:] - case bytes.HasPrefix(text, []byte("[x] ")): - _, _ = w.Write([]byte(`<span class="ui checked fitted disabled checkbox"><input type="checkbox" checked="" disabled="disabled" /><label /></span>`)) - text = text[3:] - } - node.Literal = text - } - } - return r.Renderer.RenderNode(w, node, entering) -} - -const ( - blackfridayExtensions = 0 | - blackfriday.NoIntraEmphasis | - blackfriday.Tables | - blackfriday.FencedCode | - blackfriday.Strikethrough | - blackfriday.NoEmptyLineBeforeBlock | - blackfriday.DefinitionLists | - blackfriday.Footnotes | - blackfriday.HeadingIDs | - blackfriday.AutoHeadingIDs - blackfridayHTMLFlags = 0 | - blackfriday.Smartypants -) - // RenderRaw renders Markdown to HTML without handling special links. func RenderRaw(body []byte, urlPrefix string, wikiMarkdown bool) []byte { - renderer := &Renderer{ - Renderer: blackfriday.NewHTMLRenderer(blackfriday.HTMLRendererParameters{ - Flags: blackfridayHTMLFlags, - FootnoteAnchorPrefix: "user-content-", - HeadingIDPrefix: "user-content-", - }), - URLPrefix: urlPrefix, - IsWiki: wikiMarkdown, - } + once.Do(func() { + converter = goldmark.New( + goldmark.WithExtensions(extension.Table, + extension.Strikethrough, + extension.TaskList, + extension.DefinitionList, + common.FootnoteExtension, + extension.NewTypographer( + extension.WithTypographicSubstitutions(extension.TypographicSubstitutions{ + extension.EnDash: nil, + extension.EmDash: nil, + }), + ), + ), + goldmark.WithParserOptions( + parser.WithAttribute(), + parser.WithAutoHeadingID(), + parser.WithASTTransformers( + util.Prioritized(&GiteaASTTransformer{}, 10000), + ), + ), + goldmark.WithRendererOptions( + html.WithUnsafe(), + ), + ) + + // Override the original Tasklist renderer! + converter.Renderer().AddOptions( + renderer.WithNodeRenderers( + util.Prioritized(NewTaskCheckBoxHTMLRenderer(), 1000), + ), + ) + + if setting.Markdown.EnableHardLineBreak { + converter.Renderer().AddOptions(html.WithHardWraps()) + } + }) - exts := blackfridayExtensions - if setting.Markdown.EnableHardLineBreak { - exts |= blackfriday.HardLineBreak + pc := NewGiteaParseContext(urlPrefix, wikiMarkdown) + var buf bytes.Buffer + if err := converter.Convert(giteautil.NormalizeEOL(body), &buf, parser.WithContext(pc)); err != nil { + log.Error("Unable to render: %v", err) } - // Need to normalize EOL to UNIX LF to have consistent results in rendering - body = blackfriday.Run(util.NormalizeEOL(body), blackfriday.WithRenderer(renderer), blackfriday.WithExtensions(exts)) - return markup.SanitizeBytes(body) + return markup.SanitizeReader(&buf).Bytes() } var ( @@ -174,8 +96,7 @@ func init() { } // Parser implements markup.Parser -type Parser struct { -} +type Parser struct{} // Name implements markup.Parser func (Parser) Name() string { diff --git a/modules/markup/markdown/markdown_test.go b/modules/markup/markdown/markdown_test.go index e3156a657b..53772ee441 100644 --- a/modules/markup/markdown/markdown_test.go +++ b/modules/markup/markdown/markdown_test.go @@ -98,16 +98,12 @@ func TestRender_Images(t *testing.T) { func testAnswers(baseURLContent, baseURLImages string) []string { return []string{ `<p>Wiki! Enjoy :)</p> - <ul> <li><a href="` + baseURLContent + `/Links" rel="nofollow">Links, Language bindings, Engine bindings</a></li> <li><a href="` + baseURLContent + `/Tips" rel="nofollow">Tips</a></li> </ul> - <p>See commit <a href="http://localhost:3000/gogits/gogs/commit/65f1bf27bc" rel="nofollow"><code>65f1bf27bc</code></a></p> - <p>Ideas and codes</p> - <ul> <li>Bezier widget (by <a href="` + AppURL + `r-lyeh" rel="nofollow">@r-lyeh</a>) <a href="http://localhost:3000/ocornut/imgui/issues/786" rel="nofollow">ocornut/imgui#786</a></li> <li>Bezier widget (by <a href="` + AppURL + `r-lyeh" rel="nofollow">@r-lyeh</a>) <a href="http://localhost:3000/gogits/gogs/issues/786" rel="nofollow">#786</a></li> @@ -117,13 +113,9 @@ func testAnswers(baseURLContent, baseURLImages string) []string { </ul> `, `<h2 id="user-content-what-is-wine-staging">What is Wine Staging?</h2> - <p><strong>Wine Staging</strong> on website <a href="http://wine-staging.com" rel="nofollow">wine-staging.com</a>.</p> - <h2 id="user-content-quick-links">Quick Links</h2> - <p>Here are some links to the most important topics. You can find the full list of pages at the sidebar.</p> - <table> <thead> <tr> @@ -131,7 +123,6 @@ func testAnswers(baseURLContent, baseURLImages string) []string { <th><a href="` + baseURLContent + `/Installation" rel="nofollow">Installation</a></th> </tr> </thead> - <tbody> <tr> <td><a href="` + baseURLImages + `/images/icon-usage.png" rel="nofollow"><img src="` + baseURLImages + `/images/icon-usage.png" title="icon-usage.png" alt="images/icon-usage.png"/></a></td> @@ -141,20 +132,15 @@ func testAnswers(baseURLContent, baseURLImages string) []string { </table> `, `<p><a href="http://www.excelsiorjet.com/" rel="nofollow">Excelsior JET</a> allows you to create native executables for Windows, Linux and Mac OS X.</p> - <ol> <li><a href="https://github.com/libgdx/libgdx/wiki/Gradle-on-the-Commandline#packaging-for-the-desktop" rel="nofollow">Package your libGDX application</a> <a href="` + baseURLImages + `/images/1.png" rel="nofollow"><img src="` + baseURLImages + `/images/1.png" title="1.png" alt="images/1.png"/></a></li> <li>Perform a test run by hitting the Run! button. <a href="` + baseURLImages + `/images/2.png" rel="nofollow"><img src="` + baseURLImages + `/images/2.png" title="2.png" alt="images/2.png"/></a></li> </ol> - <h2 id="user-content-custom-id">More tests</h2> - <p>(from <a href="https://www.markdownguide.org/extended-syntax/" rel="nofollow">https://www.markdownguide.org/extended-syntax/</a>)</p> - <h3 id="user-content-definition-list">Definition list</h3> - <dl> <dt>First Term</dt> <dd>This is the definition of the first term.</dd> @@ -162,27 +148,21 @@ func testAnswers(baseURLContent, baseURLImages string) []string { <dd>This is one definition of the second term.</dd> <dd>This is another definition of the second term.</dd> </dl> - <h3 id="user-content-footnotes">Footnotes</h3> - <p>Here is a simple footnote,<sup id="fnref:user-content-1"><a href="#fn:user-content-1" rel="nofollow">1</a></sup> and here is a longer one.<sup id="fnref:user-content-bignote"><a href="#fn:user-content-bignote" rel="nofollow">2</a></sup></p> - <div> - <hr/> - <ol> -<li id="fn:user-content-1">This is the first footnote.</li> - -<li id="fn:user-content-bignote"><p>Here is one with multiple paragraphs and code.</p> - +<li id="fn:user-content-1"> +<p>This is the first footnote. <a href="#fnref:user-content-1" rel="nofollow">↩︎</a></p> +</li> +<li id="fn:user-content-bignote"> +<p>Here is one with multiple paragraphs and code.</p> <p>Indent paragraphs to include them in the footnote.</p> - <p><code>{ my code }</code></p> - -<p>Add as many paragraphs as you like.</p></li> +<p>Add as many paragraphs as you like. <a href="#fnref:user-content-bignote" rel="nofollow">↩︎</a></p> +</li> </ol> - </div> `, } @@ -299,15 +279,15 @@ func TestRender_RenderParagraphs(t *testing.T) { test := func(t *testing.T, str string, cnt int) { unix := []byte(str) res := string(RenderRaw(unix, "", false)) - assert.Equal(t, strings.Count(res, "<p"), cnt) + assert.Equal(t, strings.Count(res, "<p"), cnt, "Rendered result for unix should have %d paragraph(s) but has %d:\n%s\n", cnt, strings.Count(res, "<p"), res) mac := []byte(strings.ReplaceAll(str, "\n", "\r")) res = string(RenderRaw(mac, "", false)) - assert.Equal(t, strings.Count(res, "<p"), cnt) + assert.Equal(t, strings.Count(res, "<p"), cnt, "Rendered result for mac should have %d paragraph(s) but has %d:\n%s\n", cnt, strings.Count(res, "<p"), res) dos := []byte(strings.ReplaceAll(str, "\n", "\r\n")) res = string(RenderRaw(dos, "", false)) - assert.Equal(t, strings.Count(res, "<p"), cnt) + assert.Equal(t, strings.Count(res, "<p"), cnt, "Rendered result for windows should have %d paragraph(s) but has %d:\n%s\n", cnt, strings.Count(res, "<p"), res) } test(t, "\nOne\nTwo\nThree", 1) diff --git a/modules/markup/mdstripper/mdstripper.go b/modules/markup/mdstripper/mdstripper.go index d248944b68..9d05ee3969 100644 --- a/modules/markup/mdstripper/mdstripper.go +++ b/modules/markup/mdstripper/mdstripper.go @@ -6,113 +6,128 @@ package mdstripper import ( "bytes" - "io" + "sync" - "github.com/russross/blackfriday/v2" -) + "io" -// MarkdownStripper extends blackfriday.Renderer -type MarkdownStripper struct { - links []string - coallesce bool - empty bool -} + "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/markup/common" -const ( - blackfridayExtensions = 0 | - blackfriday.NoIntraEmphasis | - blackfriday.Tables | - blackfriday.FencedCode | - blackfriday.Strikethrough | - blackfriday.NoEmptyLineBeforeBlock | - blackfriday.DefinitionLists | - blackfriday.Footnotes | - blackfriday.HeadingIDs | - blackfriday.AutoHeadingIDs | - // Not included in modules/markup/markdown/markdown.go; - // required here to process inline links - blackfriday.Autolink + "github.com/yuin/goldmark" + "github.com/yuin/goldmark/ast" + "github.com/yuin/goldmark/extension" + "github.com/yuin/goldmark/parser" + "github.com/yuin/goldmark/renderer" + "github.com/yuin/goldmark/renderer/html" + "github.com/yuin/goldmark/text" ) -// StripMarkdown parses markdown content by removing all markup and code blocks -// in order to extract links and other references -func StripMarkdown(rawBytes []byte) (string, []string) { - buf, links := StripMarkdownBytes(rawBytes) - return string(buf), links +type stripRenderer struct { + links []string + empty bool } -// StripMarkdownBytes parses markdown content by removing all markup and code blocks -// in order to extract links and other references -func StripMarkdownBytes(rawBytes []byte) ([]byte, []string) { - stripper := &MarkdownStripper{ - links: make([]string, 0, 10), - empty: true, - } - - parser := blackfriday.New(blackfriday.WithRenderer(stripper), blackfriday.WithExtensions(blackfridayExtensions)) - ast := parser.Parse(rawBytes) - var buf bytes.Buffer - stripper.RenderHeader(&buf, ast) - ast.Walk(func(node *blackfriday.Node, entering bool) blackfriday.WalkStatus { - return stripper.RenderNode(&buf, node, entering) +func (r *stripRenderer) Render(w io.Writer, source []byte, doc ast.Node) error { + return ast.Walk(doc, func(n ast.Node, entering bool) (ast.WalkStatus, error) { + if !entering { + return ast.WalkContinue, nil + } + switch v := n.(type) { + case *ast.Text: + if !v.IsRaw() { + _, prevSibIsText := n.PreviousSibling().(*ast.Text) + coalesce := prevSibIsText + r.processString( + w, + v.Text(source), + coalesce) + if v.SoftLineBreak() { + r.doubleSpace(w) + } + } + return ast.WalkContinue, nil + case *ast.Link: + r.processLink(w, v.Destination) + return ast.WalkSkipChildren, nil + case *ast.AutoLink: + r.processLink(w, v.URL(source)) + return ast.WalkSkipChildren, nil + } + return ast.WalkContinue, nil }) - stripper.RenderFooter(&buf, ast) - return buf.Bytes(), stripper.GetLinks() -} - -// RenderNode is the main rendering method. It will be called once for -// every leaf node and twice for every non-leaf node (first with -// entering=true, then with entering=false). The method should write its -// rendition of the node to the supplied writer w. -func (r *MarkdownStripper) RenderNode(w io.Writer, node *blackfriday.Node, entering bool) blackfriday.WalkStatus { - if !entering { - return blackfriday.GoToNext - } - switch node.Type { - case blackfriday.Text: - r.processString(w, node.Literal, node.Parent == nil) - return blackfriday.GoToNext - case blackfriday.Link: - r.processLink(w, node.LinkData.Destination) - r.coallesce = false - return blackfriday.SkipChildren - } - r.coallesce = false - return blackfriday.GoToNext -} - -// RenderHeader is a method that allows the renderer to produce some -// content preceding the main body of the output document. -func (r *MarkdownStripper) RenderHeader(w io.Writer, ast *blackfriday.Node) { } -// RenderFooter is a symmetric counterpart of RenderHeader. -func (r *MarkdownStripper) RenderFooter(w io.Writer, ast *blackfriday.Node) { -} - -func (r *MarkdownStripper) doubleSpace(w io.Writer) { +func (r *stripRenderer) doubleSpace(w io.Writer) { if !r.empty { _, _ = w.Write([]byte{'\n'}) } } -func (r *MarkdownStripper) processString(w io.Writer, text []byte, coallesce bool) { +func (r *stripRenderer) processString(w io.Writer, text []byte, coalesce bool) { // Always break-up words - if !coallesce || !r.coallesce { + if !coalesce { r.doubleSpace(w) } _, _ = w.Write(text) - r.coallesce = coallesce r.empty = false } -func (r *MarkdownStripper) processLink(w io.Writer, link []byte) { +func (r *stripRenderer) processLink(w io.Writer, link []byte) { // Links are processed out of band r.links = append(r.links, string(link)) - r.coallesce = false } // GetLinks returns the list of link data collected while parsing -func (r *MarkdownStripper) GetLinks() []string { +func (r *stripRenderer) GetLinks() []string { return r.links } + +// AddOptions adds given option to this renderer. +func (r *stripRenderer) AddOptions(...renderer.Option) { + // no-op +} + +// StripMarkdown parses markdown content by removing all markup and code blocks +// in order to extract links and other references +func StripMarkdown(rawBytes []byte) (string, []string) { + buf, links := StripMarkdownBytes(rawBytes) + return string(buf), links +} + +var stripParser parser.Parser +var once = sync.Once{} + +// StripMarkdownBytes parses markdown content by removing all markup and code blocks +// in order to extract links and other references +func StripMarkdownBytes(rawBytes []byte) ([]byte, []string) { + once.Do(func() { + gdMarkdown := goldmark.New( + goldmark.WithExtensions(extension.Table, + extension.Strikethrough, + extension.TaskList, + extension.DefinitionList, + common.FootnoteExtension, + common.Linkify, + ), + goldmark.WithParserOptions( + parser.WithAttribute(), + parser.WithAutoHeadingID(), + ), + goldmark.WithRendererOptions( + html.WithUnsafe(), + ), + ) + stripParser = gdMarkdown.Parser() + }) + stripper := &stripRenderer{ + links: make([]string, 0, 10), + empty: true, + } + reader := text.NewReader(rawBytes) + doc := stripParser.Parse(reader) + var buf bytes.Buffer + if err := stripper.Render(&buf, rawBytes, doc); err != nil { + log.Error("Unable to strip: %v", err) + } + return buf.Bytes(), stripper.GetLinks() +} diff --git a/modules/markup/mdstripper/mdstripper_test.go b/modules/markup/mdstripper/mdstripper_test.go index 157fe1975b..9efcc35949 100644 --- a/modules/markup/mdstripper/mdstripper_test.go +++ b/modules/markup/mdstripper/mdstripper_test.go @@ -53,6 +53,20 @@ A HIDDEN ` + "`" + `GHOST` + "`" + ` IN THIS LINE. []string{ "link", }}, + { + "Simply closes: #29 yes", + []string{ + "Simply closes: #29 yes", + }, + []string{}, + }, + { + "Simply closes: !29 yes", + []string{ + "Simply closes: !29 yes", + }, + []string{}, + }, } for _, test := range list { diff --git a/modules/markup/sanitizer.go b/modules/markup/sanitizer.go index f7789a9e56..d135d41966 100644 --- a/modules/markup/sanitizer.go +++ b/modules/markup/sanitizer.go @@ -6,6 +6,8 @@ package markup import ( + "bytes" + "io" "regexp" "sync" @@ -67,6 +69,12 @@ func Sanitize(s string) string { return sanitizer.policy.Sanitize(s) } +// SanitizeReader sanitizes a Reader +func SanitizeReader(r io.Reader) *bytes.Buffer { + NewSanitizer() + return sanitizer.policy.SanitizeReader(r) +} + // SanitizeBytes takes a []byte slice that contains a HTML fragment or document and applies policy whitelist. func SanitizeBytes(b []byte) []byte { if len(b) == 0 { |