Diffstat (limited to 'modules/markup/html.go')
-rw-r--r--	modules/markup/html.go	517
1 file changed, 517 insertions, 0 deletions
diff --git a/modules/markup/html.go b/modules/markup/html.go
new file mode 100644
index 0000000000..9daf0b0c6c
--- /dev/null
+++ b/modules/markup/html.go
@@ -0,0 +1,517 @@
+// Copyright 2017 The Gitea Authors. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+package markup
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"net/url"
+	"path"
+	"path/filepath"
+	"regexp"
+	"strings"
+
+	"code.gitea.io/gitea/modules/base"
+	"code.gitea.io/gitea/modules/log"
+	"code.gitea.io/gitea/modules/setting"
+
+	"github.com/Unknwon/com"
+	"golang.org/x/net/html"
+)
+
+// Issue name styles
+const (
+	IssueNameStyleNumeric      = "numeric"
+	IssueNameStyleAlphanumeric = "alphanumeric"
+)
+
+var (
+	// NOTE: None of the regexes below perform any extra validation.
+	// Thus a link is produced even if the linked entity does not exist.
+	// While fast, this is also incorrect and leads to false positives.
+	// TODO: fix invalid linking issue
+
+	// MentionPattern matches a string that mentions someone, e.g. @Unknwon
+	MentionPattern = regexp.MustCompile(`(\s|^|\W)@[0-9a-zA-Z-_\.]+`)
+
+	// IssueNumericPattern matches a string that references a numeric issue, e.g. #1287
+	IssueNumericPattern = regexp.MustCompile(`( |^|\()#[0-9]+\b`)
+	// IssueAlphanumericPattern matches a string that references an alphanumeric issue, e.g. ABC-1234
+	IssueAlphanumericPattern = regexp.MustCompile(`( |^|\()[A-Z]{1,10}-[1-9][0-9]*\b`)
+	// CrossReferenceIssueNumericPattern matches a string that references a numeric issue in a different repository,
+	// e.g. gogits/gogs#12345
+	CrossReferenceIssueNumericPattern = regexp.MustCompile(`( |^)[0-9a-zA-Z]+/[0-9a-zA-Z]+#[0-9]+\b`)
+
+	// Sha1CurrentPattern matches a string that represents a commit SHA, e.g. d8a994ef243349f321568f9e36d5c3f444b99cae
+	// Although SHA1 hashes are 40 chars long, the regex matches hashes from 7 to 40 chars in length
+	// so that abbreviated hash links can be used as well. This matches git and GitHub usability.
+	Sha1CurrentPattern = regexp.MustCompile(`(?:^|\s|\()([0-9a-f]{7,40})\b`)
+
+	// ShortLinkPattern matches the short but difficult-to-parse [[name|link|arg=test]] syntax
+	ShortLinkPattern = regexp.MustCompile(`(\[\[.*?\]\]\w*)`)
+
+	// AnySHA1Pattern splits a URL containing a SHA1 hash into its parts
+	AnySHA1Pattern = regexp.MustCompile(`(http\S*)://(\S+)/(\S+)/(\S+)/(\S+)/([0-9a-f]{40})(?:/?([^#\s]+)?(?:#(\S+))?)?`)
+
+	validLinksPattern = regexp.MustCompile(`^[a-z][\w-]+://`)
+)
+
+// regexp for full links to issues/pulls
+var issueFullPattern *regexp.Regexp
+
+// IsLink reports whether the link is in a valid format.
+func IsLink(link []byte) bool {
+	return isLink(link)
+}
+
+// isLink reports whether the link is in a valid format.
+func isLink(link []byte) bool {
+	return validLinksPattern.Match(link)
+}
+
+func getIssueFullPattern() *regexp.Regexp {
+	if issueFullPattern == nil {
+		appURL := setting.AppURL
+		if len(appURL) > 0 && appURL[len(appURL)-1] != '/' {
+			appURL += "/"
+		}
+		issueFullPattern = regexp.MustCompile(appURL +
+			`\w+/\w+/(?:issues|pulls)/((?:\w{1,10}-)?[1-9][0-9]*)([\?|#]\S+.(\S+)?)?\b`)
+	}
+	return issueFullPattern
+}
+
+// FindAllMentions matches mention patterns in the given content
+// and returns a list of found user names without the @ prefix.
+func FindAllMentions(content string) []string {
+	mentions := MentionPattern.FindAllString(content, -1)
+	for i := range mentions {
+		mentions[i] = mentions[i][strings.Index(mentions[i], "@")+1:] // Strip @ character
+	}
+	return mentions
+}
+
+// cutoutVerbosePrefix cuts out the URL prefix, including the sub-path, to
+// return a clean, unified string of the request URL path.
+func cutoutVerbosePrefix(prefix string) string {
+	if len(prefix) == 0 || prefix[0] != '/' {
+		return prefix
+	}
+	count := 0
+	for i := 0; i < len(prefix); i++ {
+		if prefix[i] == '/' {
+			count++
+		}
+		if count >= 3+setting.AppSubURLDepth {
+			return prefix[:i]
+		}
+	}
+	return prefix
+}
+
+// URLJoin joins URL components like path.Join, but preserves the scheme and host of the base URL
+func URLJoin(base string, elems ...string) string {
+	u, err := url.Parse(base)
+	if err != nil {
+		log.Error(4, "URLJoin: Invalid base URL %s", base)
+		return ""
+	}
+	joinArgs := make([]string, 0, len(elems)+1)
+	joinArgs = append(joinArgs, u.Path)
+	joinArgs = append(joinArgs, elems...)
+	u.Path = path.Join(joinArgs...)
+	return u.String()
+}
+
+// RenderIssueIndexPattern renders issue indexes to corresponding links.
+func RenderIssueIndexPattern(rawBytes []byte, urlPrefix string, metas map[string]string) []byte {
+	urlPrefix = cutoutVerbosePrefix(urlPrefix)
+
+	pattern := IssueNumericPattern
+	if metas["style"] == IssueNameStyleAlphanumeric {
+		pattern = IssueAlphanumericPattern
+	}
+
+	ms := pattern.FindAll(rawBytes, -1)
+	for _, m := range ms {
+		if m[0] == ' ' || m[0] == '(' {
+			m = m[1:] // ignore leading space or opening parenthesis
+		}
+		var link string
+		if metas == nil {
+			link = fmt.Sprintf(`<a href="%s">%s</a>`, URLJoin(urlPrefix, "issues", string(m[1:])), m)
+		} else {
+			// Support for external issue tracker
+			if metas["style"] == IssueNameStyleAlphanumeric {
+				metas["index"] = string(m)
+			} else {
+				metas["index"] = string(m[1:])
+			}
+			link = fmt.Sprintf(`<a href="%s">%s</a>`, com.Expand(metas["format"], metas), m)
+		}
+		rawBytes = bytes.Replace(rawBytes, m, []byte(link), 1)
+	}
+	return rawBytes
+}
+
+// IsSameDomain checks whether the given URL string has the same hostname as the current Gitea instance
+func IsSameDomain(s string) bool {
+	if strings.HasPrefix(s, "/") {
+		return true
+	}
+	if uapp, err := url.Parse(setting.AppURL); err == nil {
+		if u, err := url.Parse(s); err == nil {
+			return u.Host == uapp.Host
+		}
+		return false
+	}
+	return false
+}
+
+// renderFullSha1Pattern renders URLs that contain SHA1 hashes
+func renderFullSha1Pattern(rawBytes []byte, urlPrefix string) []byte {
+	ms := AnySHA1Pattern.FindAllSubmatch(rawBytes, -1)
+	for _, m := range ms {
+		all := m[0]
+		protocol := string(m[1])
+		paths := string(m[2])
+		path := protocol + "://" + paths
+		author := string(m[3])
+		repoName := string(m[4])
+		path = URLJoin(path, author, repoName)
+		ltype := "src"
+		itemType := m[5]
+		if IsSameDomain(paths) {
+			ltype = string(itemType)
+		} else if string(itemType) == "commit" {
+			ltype = "commit"
+		}
+		sha := m[6]
+		var subtree string
+		if len(m) > 7 && len(m[7]) > 0 {
+			subtree = string(m[7])
+		}
+		var line []byte
+		if len(m) > 8 && len(m[8]) > 0 {
+			line = m[8]
+		}
+		urlSuffix := ""
+		text := base.ShortSha(string(sha))
+		if subtree != "" {
+			urlSuffix = "/" + subtree
+			text += urlSuffix
+		}
+		if line != nil {
+			value := string(line)
+			urlSuffix += "#"
+			urlSuffix += value
+			text += " ("
+			text += value
+			text += ")"
+		}
+		rawBytes = bytes.Replace(rawBytes, all, []byte(fmt.Sprintf(
+			`<a href="%s">%s</a>`, URLJoin(path, ltype, string(sha))+urlSuffix, text)), -1)
+	}
+	return rawBytes
+}
+
+// RenderFullIssuePattern renders issue-like URLs
+func RenderFullIssuePattern(rawBytes []byte) []byte {
+	ms := getIssueFullPattern().FindAllSubmatch(rawBytes, -1)
+	for _, m := range ms {
+		all := m[0]
+		id := string(m[1])
+		text := "#" + id
+		// TODO if m[2] is not nil, then the link is to a comment,
+		// and we should indicate that in the text somehow
+		rawBytes = bytes.Replace(rawBytes, all, []byte(fmt.Sprintf(
+			`<a href="%s">%s</a>`, string(all), text)), -1)
+	}
+	return rawBytes
+}
+
+func firstIndexOfByte(sl []byte, target byte) int {
+	for i := 0; i < len(sl); i++ {
+		if sl[i] == target {
+			return i
+		}
+	}
+	return -1
+}
+
+func lastIndexOfByte(sl []byte, target byte) int {
+	for i := len(sl) - 1; i >= 0; i-- {
+		if sl[i] == target {
+			return i
+		}
+	}
+	return -1
+}
+
+// RenderShortLinks processes the [[syntax]]
+//
+// noLink, when set to true, disables making link tags,
+// so this function just replaces the whole [[...]] with its content text
+//
+// isWikiMarkdown is a flag to choose the linking URL prefix
+func RenderShortLinks(rawBytes []byte, urlPrefix string, noLink bool, isWikiMarkdown bool) []byte {
+	ms := ShortLinkPattern.FindAll(rawBytes, -1)
+	for _, m := range ms {
+		orig := bytes.TrimSpace(m)
+		m = orig[2:]
+		tailPos := lastIndexOfByte(m, ']') + 1
+		tail := []byte{}
+		if tailPos < len(m) {
+			tail = m[tailPos:]
+			m = m[:tailPos-1]
+		}
+		m = m[:len(m)-2]
+		props := map[string]string{}
+
+		// MediaWiki uses [[link|text]], while GitHub uses [[text|link]].
+		// This makes page handling terrible, but we prefer GitHub syntax
+		// and fall back to MediaWiki only when it is obvious from the
+		// text and link contents.
+		sl := bytes.Split(m, []byte("|"))
+		for _, v := range sl {
+			switch bytes.Count(v, []byte("=")) {
+
+			// Piped args without the = sign; these are mandatory arguments
+			case 0:
+				{
+					sv := string(v)
+					if props["name"] == "" {
+						if isLink(v) {
+							// If this clearly looks like a link, save it as such
+
+							// But first we need to ensure that if both mandatory args
+							// look like links, we stick to GitHub syntax
+							if props["link"] != "" {
+								props["name"] = props["link"]
+							}
+
+							props["link"] = strings.TrimSpace(sv)
+						} else {
+							props["name"] = sv
+						}
+					} else {
+						props["link"] = strings.TrimSpace(sv)
+					}
+				}
+
+			// Piped args with the = sign; these are optional arguments
+			case 1:
+				{
+					sep := firstIndexOfByte(v, '=')
+					key, val := string(v[:sep]), html.UnescapeString(string(v[sep+1:]))
+					lastCharIndex := len(val) - 1
+					if (val[0] == '"' || val[0] == '\'') && (val[lastCharIndex] == '"' || val[lastCharIndex] == '\'') {
+						val = val[1:lastCharIndex]
+					}
+					props[key] = val
+				}
+			}
+		}
+
+		var name string
+		var link string
+		if props["link"] != "" {
+			link = props["link"]
+		} else if props["name"] != "" {
+			link = props["name"]
+		}
+		if props["title"] != "" {
+			name = props["title"]
+		} else if props["name"] != "" {
+			name = props["name"]
+		} else {
+			name = link
+		}
+
+		name += string(tail)
+		image := false
+		ext := filepath.Ext(string(link))
+		if ext != "" {
+			switch ext {
+			case ".jpg", ".jpeg", ".png", ".tif", ".tiff", ".webp", ".gif", ".bmp", ".ico", ".svg":
+				{
+					image = true
+				}
+			}
+		}
+		absoluteLink := isLink([]byte(link))
+		if !absoluteLink {
+			link = strings.Replace(link, " ", "+", -1)
+		}
+		if image {
+			if !absoluteLink {
+				if IsSameDomain(urlPrefix) {
+					urlPrefix = strings.Replace(urlPrefix, "/src/", "/raw/", 1)
+				}
+				if isWikiMarkdown {
+					link = URLJoin("wiki", "raw", link)
+				}
+				link = URLJoin(urlPrefix, link)
+			}
props["title"] + if title == "" { + title = props["alt"] + } + if title == "" { + title = path.Base(string(name)) + } + alt := props["alt"] + if alt == "" { + alt = name + } + if alt != "" { + alt = `alt="` + alt + `"` + } + name = fmt.Sprintf(`<img src="%s" %s title="%s" />`, link, alt, title) + } else if !absoluteLink { + if isWikiMarkdown { + link = URLJoin("wiki", link) + } + link = URLJoin(urlPrefix, link) + } + if noLink { + rawBytes = bytes.Replace(rawBytes, orig, []byte(name), -1) + } else { + rawBytes = bytes.Replace(rawBytes, orig, + []byte(fmt.Sprintf(`<a href="%s">%s</a>`, link, name)), -1) + } + } + return rawBytes +} + +// RenderCrossReferenceIssueIndexPattern renders issue indexes from other repositories to corresponding links. +func RenderCrossReferenceIssueIndexPattern(rawBytes []byte, urlPrefix string, metas map[string]string) []byte { + ms := CrossReferenceIssueNumericPattern.FindAll(rawBytes, -1) + for _, m := range ms { + if m[0] == ' ' || m[0] == '(' { + m = m[1:] // ignore leading space or opening parentheses + } + + repo := string(bytes.Split(m, []byte("#"))[0]) + issue := string(bytes.Split(m, []byte("#"))[1]) + + link := fmt.Sprintf(`<a href="%s">%s</a>`, URLJoin(setting.AppURL, repo, "issues", issue), m) + rawBytes = bytes.Replace(rawBytes, m, []byte(link), 1) + } + return rawBytes +} + +// renderSha1CurrentPattern renders SHA1 strings to corresponding links that assumes in the same repository. +func renderSha1CurrentPattern(rawBytes []byte, urlPrefix string) []byte { + ms := Sha1CurrentPattern.FindAllSubmatch(rawBytes, -1) + for _, m := range ms { + hash := m[1] + // The regex does not lie, it matches the hash pattern. + // However, a regex cannot know if a hash actually exists or not. + // We could assume that a SHA1 hash should probably contain alphas AND numerics + // but that is not always the case. + // Although unlikely, deadbeef and 1234567 are valid short forms of SHA1 hash + // as used by git and github for linking and thus we have to do similar. + rawBytes = bytes.Replace(rawBytes, hash, []byte(fmt.Sprintf( + `<a href="%s">%s</a>`, URLJoin(urlPrefix, "commit", string(hash)), base.ShortSha(string(hash)))), -1) + } + return rawBytes +} + +// RenderSpecialLink renders mentions, indexes and SHA1 strings to corresponding links. +func RenderSpecialLink(rawBytes []byte, urlPrefix string, metas map[string]string, isWikiMarkdown bool) []byte { + ms := MentionPattern.FindAll(rawBytes, -1) + for _, m := range ms { + m = m[bytes.Index(m, []byte("@")):] + rawBytes = bytes.Replace(rawBytes, m, + []byte(fmt.Sprintf(`<a href="%s">%s</a>`, URLJoin(setting.AppURL, string(m[1:])), m)), -1) + } + + rawBytes = RenderFullIssuePattern(rawBytes) + rawBytes = RenderShortLinks(rawBytes, urlPrefix, false, isWikiMarkdown) + rawBytes = RenderIssueIndexPattern(rawBytes, urlPrefix, metas) + rawBytes = RenderCrossReferenceIssueIndexPattern(rawBytes, urlPrefix, metas) + rawBytes = renderFullSha1Pattern(rawBytes, urlPrefix) + rawBytes = renderSha1CurrentPattern(rawBytes, urlPrefix) + return rawBytes +} + +var ( + leftAngleBracket = []byte("</") + rightAngleBracket = []byte(">") +) + +var noEndTags = []string{"img", "input", "br", "hr"} + +// PostProcess treats different types of HTML differently, +// and only renders special links for plain text blocks. 
+func PostProcess(rawHTML []byte, urlPrefix string, metas map[string]string, isWikiMarkdown bool) []byte {
+	startTags := make([]string, 0, 5)
+	var buf bytes.Buffer
+	tokenizer := html.NewTokenizer(bytes.NewReader(rawHTML))
+
+OUTER_LOOP:
+	for html.ErrorToken != tokenizer.Next() {
+		token := tokenizer.Token()
+		switch token.Type {
+		case html.TextToken:
+			buf.Write(RenderSpecialLink([]byte(token.String()), urlPrefix, metas, isWikiMarkdown))
+
+		case html.StartTagToken:
+			buf.WriteString(token.String())
+			tagName := token.Data
+			// If this is an excluded tag, we skip processing all output until a close tag is encountered.
+			if strings.EqualFold("a", tagName) || strings.EqualFold("code", tagName) || strings.EqualFold("pre", tagName) {
+				stackNum := 1
+				for html.ErrorToken != tokenizer.Next() {
+					token = tokenizer.Token()
+
+					// Copy the token to the output verbatim
+					buf.Write(RenderShortLinks([]byte(token.String()), urlPrefix, true, isWikiMarkdown))
+
+					if token.Type == html.StartTagToken && !com.IsSliceContainsStr(noEndTags, token.Data) {
+						stackNum++
+					}
+
+					// If this is the close tag of the outermost tag, we are done
+					if token.Type == html.EndTagToken {
+						stackNum--
+
+						if stackNum <= 0 && strings.EqualFold(tagName, token.Data) {
+							break
+						}
+					}
+				}
+				continue OUTER_LOOP
+			}
+
+			if !com.IsSliceContainsStr(noEndTags, tagName) {
+				startTags = append(startTags, tagName)
+			}
+
+		case html.EndTagToken:
+			if len(startTags) == 0 {
+				buf.WriteString(token.String())
+				break
+			}
+
+			buf.Write(leftAngleBracket)
+			buf.WriteString(startTags[len(startTags)-1])
+			buf.Write(rightAngleBracket)
+			startTags = startTags[:len(startTags)-1]
+		default:
+			buf.WriteString(token.String())
+		}
+	}
+
+	if io.EOF == tokenizer.Err() {
+		return buf.Bytes()
+	}
+
+	// If we are not at the end of the input, then some other parsing error has occurred,
+	// so return the input verbatim.
+	return rawHTML
+}
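
A few usage sketches follow for the exported entry points in this file. They are illustrative only: they assume the package is importable as code.gitea.io/gitea/modules/markup (matching the file header and the sibling imports above), that the surrounding Gitea module builds, and that example.com, owner, and repo are placeholder names. First, URLJoin: plain path.Join on a full URL string would collapse the "//" after the scheme, so URLJoin parses the base URL and joins only its path component:

	package main

	import (
		"fmt"

		"code.gitea.io/gitea/modules/markup" // assumed import path, per the file header
	)

	func main() {
		// path.Join("https://example.com/gitea/", "owner") would mangle the scheme;
		// URLJoin joins on the parsed URL's path and re-serializes the whole URL.
		fmt.Println(markup.URLJoin("https://example.com/gitea/", "owner", "repo", "issues", "1"))
		// Expected: https://example.com/gitea/owner/repo/issues/1
	}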
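
FindAllMentions relies on MentionPattern matching at the start of the string or after whitespace/non-word characters, and strips the @ from each result. A minimal sketch, with hypothetical user names:

	package main

	import (
		"fmt"

		"code.gitea.io/gitea/modules/markup" // assumed import path
	)

	func main() {
		// "@alice" matches at the start of the string, "@bob-smith" after a space;
		// the leading @ is stripped from each returned name.
		fmt.Println(markup.FindAllMentions("@alice please review, cc @bob-smith"))
		// Expected: [alice bob-smith]
	}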
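
When metas is non-nil, RenderIssueIndexPattern links to an external issue tracker by expanding metas["format"] with com.Expand rather than linking into the local repository. A sketch, assuming a hypothetical tracker URL with the {user}, {repo} and {index} placeholders that com.Expand substitutes from the metas map:

	package main

	import (
		"fmt"

		"code.gitea.io/gitea/modules/markup" // assumed import path
	)

	func main() {
		metas := map[string]string{
			// Hypothetical external tracker; {index} is filled in per matched reference.
			"format": "https://tracker.example.com/{user}/{repo}/issues/{index}",
			"user":   "owner",
			"repo":   "repo",
			"style":  markup.IssueNameStyleNumeric,
		}
		out := markup.RenderIssueIndexPattern([]byte("Fixes #1287."), "https://example.com/owner/repo", metas)
		fmt.Println(string(out))
		// Expected roughly:
		// Fixes <a href="https://tracker.example.com/owner/repo/issues/1287">#1287</a>.
	}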
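
RenderShortLinks prefers the GitHub-style [[text|link]] order and prepends a wiki segment when isWikiMarkdown is set. A sketch of the two non-image paths, with placeholder URLs:

	package main

	import (
		"fmt"

		"code.gitea.io/gitea/modules/markup" // assumed import path
	)

	func main() {
		prefix := "https://example.com/owner/repo"

		// With a single mandatory argument, the name doubles as the link target.
		fmt.Println(string(markup.RenderShortLinks([]byte("See [[Home]]"), prefix, false, false)))
		// Expected roughly: See <a href="https://example.com/owner/repo/Home">Home</a>

		// With isWikiMarkdown=true, "wiki" is joined into the link before the prefix.
		fmt.Println(string(markup.RenderShortLinks([]byte("See [[Home]]"), prefix, false, true)))
		// Expected roughly: See <a href="https://example.com/owner/repo/wiki/Home">Home</a>
	}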
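
Finally, PostProcess is the top-level entry point: it walks the already-rendered HTML token by token, runs RenderSpecialLink only on text tokens, and copies a, code, and pre subtrees through with just the noLink short-link expansion. A sketch with nil metas and a placeholder repository URL:

	package main

	import (
		"fmt"

		"code.gitea.io/gitea/modules/markup" // assumed import path
	)

	func main() {
		raw := []byte(`<p>Closes #1287</p><pre>#42 stays verbatim</pre>`)
		out := markup.PostProcess(raw, "https://example.com/owner/repo", nil, false)
		fmt.Println(string(out))
		// Expected roughly: #1287 inside the <p> becomes
		// <a href="https://example.com/owner/repo/issues/1287">#1287</a>,
		// while #42 inside the <pre> block is left untouched.
	}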