diff options
author | KN4CK3R <KN4CK3R@users.noreply.github.com> | 2021-03-29 22:44:28 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-03-29 22:44:28 +0200 |
commit | 0c6137617fbf41ee6cb315f96a2acc2dd91203e8 (patch) | |
tree | 27c8d1304334f1783232166927093419079ecd2a /modules/markup | |
parent | d3b8127ad372bbce8d891d8893ffe6e834590751 (diff) | |
download | gitea-0c6137617fbf41ee6cb315f96a2acc2dd91203e8.tar.gz gitea-0c6137617fbf41ee6cb315f96a2acc2dd91203e8.zip |
Add Tabular Diff for CSV files (#14661)
Implements request #14320. The rendering of CSV files now matches the diff style.
* Moved CSV logic into base package.
* Added method to create a tabular diff.
* Added CSV compare context.
* Added CSV diff template.
* Use new table style in CSV markup.
* Added file size limit for CSV rendering.
* Display CSV parser errors in diff.
* Lazy read single file.
* Lazy read rows for full diff.
* Added unit tests for various CSV changes.
Diffstat (limited to 'modules/markup')
-rw-r--r-- | modules/markup/csv/csv.go | 103 | ||||
-rw-r--r-- | modules/markup/csv/csv_test.go | 12 | ||||
-rw-r--r-- | modules/markup/sanitizer.go | 4 |
3 files changed, 48 insertions, 71 deletions
diff --git a/modules/markup/csv/csv.go b/modules/markup/csv/csv.go index 1e3acc9b47..430e759eb5 100644 --- a/modules/markup/csv/csv.go +++ b/modules/markup/csv/csv.go @@ -6,24 +6,20 @@ package markup import ( "bytes" - "encoding/csv" "html" "io" - "regexp" - "strings" + "strconv" + "code.gitea.io/gitea/modules/csv" "code.gitea.io/gitea/modules/markup" - "code.gitea.io/gitea/modules/util" + "code.gitea.io/gitea/modules/setting" ) -var quoteRegexp = regexp.MustCompile(`["'][\s\S]+?["']`) - func init() { markup.RegisterParser(Parser{}) - } -// Parser implements markup.Parser for orgmode +// Parser implements markup.Parser for csv files type Parser struct { } @@ -38,11 +34,35 @@ func (Parser) Extensions() []string { } // Render implements markup.Parser -func (p Parser) Render(rawBytes []byte, urlPrefix string, metas map[string]string, isWiki bool) []byte { - rd := csv.NewReader(bytes.NewReader(rawBytes)) - rd.Comma = p.bestDelimiter(rawBytes) +func (Parser) Render(rawBytes []byte, urlPrefix string, metas map[string]string, isWiki bool) []byte { var tmpBlock bytes.Buffer - tmpBlock.WriteString(`<table class="table">`) + + if setting.UI.CSV.MaxFileSize != 0 && setting.UI.CSV.MaxFileSize < int64(len(rawBytes)) { + tmpBlock.WriteString("<pre>") + tmpBlock.WriteString(html.EscapeString(string(rawBytes))) + tmpBlock.WriteString("</pre>") + return tmpBlock.Bytes() + } + + rd := csv.CreateReaderAndGuessDelimiter(rawBytes) + + writeField := func(element, class, field string) { + tmpBlock.WriteString("<") + tmpBlock.WriteString(element) + if len(class) > 0 { + tmpBlock.WriteString(" class=\"") + tmpBlock.WriteString(class) + tmpBlock.WriteString("\"") + } + tmpBlock.WriteString(">") + tmpBlock.WriteString(html.EscapeString(field)) + tmpBlock.WriteString("</") + tmpBlock.WriteString(element) + tmpBlock.WriteString(">") + } + + tmpBlock.WriteString(`<table class="data-table">`) + row := 1 for { fields, err := rd.Read() if err == io.EOF { @@ -52,62 +72,19 @@ func (p Parser) 
Render(rawBytes []byte, urlPrefix string, metas map[string]strin continue } tmpBlock.WriteString("<tr>") + element := "td" + if row == 1 { + element = "th" + } + writeField(element, "line-num", strconv.Itoa(row)) for _, field := range fields { - tmpBlock.WriteString("<td>") - tmpBlock.WriteString(html.EscapeString(field)) - tmpBlock.WriteString("</td>") + writeField(element, "", field) } tmpBlock.WriteString("</tr>") + + row++ } tmpBlock.WriteString("</table>") return tmpBlock.Bytes() } - -// bestDelimiter scores the input CSV data against delimiters, and returns the best match. -// Reads at most 10k bytes & 10 lines. -func (p Parser) bestDelimiter(data []byte) rune { - maxLines := 10 - maxBytes := util.Min(len(data), 1e4) - text := string(data[:maxBytes]) - text = quoteRegexp.ReplaceAllLiteralString(text, "") - lines := strings.SplitN(text, "\n", maxLines+1) - lines = lines[:util.Min(maxLines, len(lines))] - - delimiters := []rune{',', ';', '\t', '|'} - bestDelim := delimiters[0] - bestScore := 0.0 - for _, delim := range delimiters { - score := p.scoreDelimiter(lines, delim) - if score > bestScore { - bestScore = score - bestDelim = delim - } - } - - return bestDelim -} - -// scoreDelimiter uses a count & regularity metric to evaluate a delimiter against lines of CSV -func (Parser) scoreDelimiter(lines []string, delim rune) (score float64) { - countTotal := 0 - countLineMax := 0 - linesNotEqual := 0 - - for _, line := range lines { - if len(line) == 0 { - continue - } - - countLine := strings.Count(line, string(delim)) - countTotal += countLine - if countLine != countLineMax { - if countLineMax != 0 { - linesNotEqual++ - } - countLineMax = util.Max(countLine, countLineMax) - } - } - - return float64(countTotal) * (1 - float64(linesNotEqual)/float64(len(lines))) -} diff --git a/modules/markup/csv/csv_test.go b/modules/markup/csv/csv_test.go index 4d4e0871e9..5438ebdf5c 100644 --- a/modules/markup/csv/csv_test.go +++ b/modules/markup/csv/csv_test.go @@ -13,14 
+13,10 @@ import ( func TestRenderCSV(t *testing.T) { var parser Parser var kases = map[string]string{ - "a": "<table class=\"table\"><tr><td>a</td></tr></table>", - "1,2": "<table class=\"table\"><tr><td>1</td><td>2</td></tr></table>", - "1;2": "<table class=\"table\"><tr><td>1</td><td>2</td></tr></table>", - "1\t2": "<table class=\"table\"><tr><td>1</td><td>2</td></tr></table>", - "1|2": "<table class=\"table\"><tr><td>1</td><td>2</td></tr></table>", - "1,2,3;4,5,6;7,8,9\na;b;c": "<table class=\"table\"><tr><td>1,2,3</td><td>4,5,6</td><td>7,8,9</td></tr><tr><td>a</td><td>b</td><td>c</td></tr></table>", - "\"1,2,3,4\";\"a\nb\"\nc;d": "<table class=\"table\"><tr><td>1,2,3,4</td><td>a\nb</td></tr><tr><td>c</td><td>d</td></tr></table>", - "<br/>": "<table class=\"table\"><tr><td><br/></td></tr></table>", + "a": "<table class=\"data-table\"><tr><th class=\"line-num\">1</th><th>a</th></tr></table>", + "1,2": "<table class=\"data-table\"><tr><th class=\"line-num\">1</th><th>1</th><th>2</th></tr></table>", + "1;2\n3;4": "<table class=\"data-table\"><tr><th class=\"line-num\">1</th><th>1</th><th>2</th></tr><tr><td class=\"line-num\">2</td><td>3</td><td>4</td></tr></table>", + "<br/>": "<table class=\"data-table\"><tr><th class=\"line-num\">1</th><th><br/></th></tr></table>", } for k, v := range kases { diff --git a/modules/markup/sanitizer.go b/modules/markup/sanitizer.go index 9214a75fb3..19feaa3cce 100644 --- a/modules/markup/sanitizer.go +++ b/modules/markup/sanitizer.go @@ -69,6 +69,10 @@ func ReplaceSanitizer() { // Allow icons, emojis, and chroma syntax on span sanitizer.policy.AllowAttrs("class").Matching(regexp.MustCompile(`^((icon(\s+[\p{L}\p{N}_-]+)+)|(emoji))$|^([a-z][a-z0-9]{0,2})$`)).OnElements("span") + // Allow data tables + sanitizer.policy.AllowAttrs("class").Matching(regexp.MustCompile(`data-table`)).OnElements("table") + sanitizer.policy.AllowAttrs("class").Matching(regexp.MustCompile(`line-num`)).OnElements("th", "td") + // Allow generally safe 
attributes generalSafeAttrs := []string{"abbr", "accept", "accept-charset", "accesskey", "action", "align", "alt", |