diff options
author | KN4CK3R <KN4CK3R@users.noreply.github.com> | 2021-03-29 22:44:28 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-03-29 22:44:28 +0200 |
commit | 0c6137617fbf41ee6cb315f96a2acc2dd91203e8 (patch) | |
tree | 27c8d1304334f1783232166927093419079ecd2a /modules | |
parent | d3b8127ad372bbce8d891d8893ffe6e834590751 (diff) | |
download | gitea-0c6137617fbf41ee6cb315f96a2acc2dd91203e8.tar.gz gitea-0c6137617fbf41ee6cb315f96a2acc2dd91203e8.zip |
Add Tabular Diff for CSV files (#14661)
Implements request #14320 The rendering of CSV files does match the diff style.
* Moved CSV logic into base package.
* Added method to create a tabular diff.
* Added CSV compare context.
* Added CSV diff template.
* Use new table style in CSV markup.
* Added file size limit for CSV rendering.
* Display CSV parser errors in diff.
* Lazy read single file.
* Lazy read rows for full diff.
* Added unit tests for various CSV changes.
Diffstat (limited to 'modules')
-rw-r--r-- | modules/csv/csv.go | 93 | ||||
-rw-r--r-- | modules/csv/csv_test.go | 40 | ||||
-rw-r--r-- | modules/markup/csv/csv.go | 103 | ||||
-rw-r--r-- | modules/markup/csv/csv_test.go | 12 | ||||
-rw-r--r-- | modules/markup/sanitizer.go | 4 | ||||
-rw-r--r-- | modules/setting/setting.go | 9 |
6 files changed, 190 insertions, 71 deletions
diff --git a/modules/csv/csv.go b/modules/csv/csv.go new file mode 100644 index 0000000000..1aa78fdeec --- /dev/null +++ b/modules/csv/csv.go @@ -0,0 +1,93 @@ +// Copyright 2021 The Gitea Authors. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package csv + +import ( + "bytes" + "encoding/csv" + "errors" + "regexp" + "strings" + + "code.gitea.io/gitea/modules/translation" + "code.gitea.io/gitea/modules/util" +) + +var quoteRegexp = regexp.MustCompile(`["'][\s\S]+?["']`) + +// CreateReader creates a csv.Reader with the given delimiter. +func CreateReader(rawBytes []byte, delimiter rune) *csv.Reader { + rd := csv.NewReader(bytes.NewReader(rawBytes)) + rd.Comma = delimiter + rd.TrimLeadingSpace = true + return rd +} + +// CreateReaderAndGuessDelimiter tries to guess the field delimiter from the content and creates a csv.Reader. +func CreateReaderAndGuessDelimiter(rawBytes []byte) *csv.Reader { + delimiter := guessDelimiter(rawBytes) + return CreateReader(rawBytes, delimiter) +} + +// guessDelimiter scores the input CSV data against delimiters, and returns the best match. +// Reads at most 10k bytes & 10 lines. +func guessDelimiter(data []byte) rune { + maxLines := 10 + maxBytes := util.Min(len(data), 1e4) + text := string(data[:maxBytes]) + text = quoteRegexp.ReplaceAllLiteralString(text, "") + lines := strings.SplitN(text, "\n", maxLines+1) + lines = lines[:util.Min(maxLines, len(lines))] + + delimiters := []rune{',', ';', '\t', '|', '@'} + bestDelim := delimiters[0] + bestScore := 0.0 + for _, delim := range delimiters { + score := scoreDelimiter(lines, delim) + if score > bestScore { + bestScore = score + bestDelim = delim + } + } + + return bestDelim +} + +// scoreDelimiter uses a count & regularity metric to evaluate a delimiter against lines of CSV. +func scoreDelimiter(lines []string, delim rune) float64 { + countTotal := 0 + countLineMax := 0 + linesNotEqual := 0 + + for _, line := range lines { + if len(line) == 0 { + continue + } + + countLine := strings.Count(line, string(delim)) + countTotal += countLine + if countLine != countLineMax { + if countLineMax != 0 { + linesNotEqual++ + } + countLineMax = util.Max(countLine, countLineMax) + } + } + + return float64(countTotal) * (1 - float64(linesNotEqual)/float64(len(lines))) +} + +// FormatError converts csv errors into readable messages. +func FormatError(err error, locale translation.Locale) (string, error) { + var perr *csv.ParseError + if errors.As(err, &perr) { + if perr.Err == csv.ErrFieldCount { + return locale.Tr("repo.error.csv.invalid_field_count", perr.Line), nil + } + return locale.Tr("repo.error.csv.unexpected", perr.Line, perr.Column), nil + } + + return "", err +} diff --git a/modules/csv/csv_test.go b/modules/csv/csv_test.go new file mode 100644 index 0000000000..3a7584e21d --- /dev/null +++ b/modules/csv/csv_test.go @@ -0,0 +1,40 @@ +// Copyright 2021 The Gitea Authors. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package csv + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestCreateReader(t *testing.T) { + rd := CreateReader([]byte{}, ',') + assert.Equal(t, ',', rd.Comma) +} + +func TestCreateReaderAndGuessDelimiter(t *testing.T) { + input := "a;b;c\n1;2;3\n4;5;6" + + rd := CreateReaderAndGuessDelimiter([]byte(input)) + assert.Equal(t, ';', rd.Comma) +} + +func TestGuessDelimiter(t *testing.T) { + var kases = map[string]rune{ + "a": ',', + "1,2": ',', + "1;2": ';', + "1\t2": '\t', + "1|2": '|', + "1,2,3;4,5,6;7,8,9\na;b;c": ';', + "\"1,2,3,4\";\"a\nb\"\nc;d": ';', + "<br/>": ',', + } + + for k, v := range kases { + assert.EqualValues(t, guessDelimiter([]byte(k)), v) + } +} diff --git a/modules/markup/csv/csv.go b/modules/markup/csv/csv.go index 1e3acc9b47..430e759eb5 100644 --- a/modules/markup/csv/csv.go +++ b/modules/markup/csv/csv.go @@ -6,24 +6,20 @@ package markup import ( "bytes" - "encoding/csv" "html" "io" - "regexp" - "strings" + "strconv" + "code.gitea.io/gitea/modules/csv" "code.gitea.io/gitea/modules/markup" - "code.gitea.io/gitea/modules/util" + "code.gitea.io/gitea/modules/setting" ) -var quoteRegexp = regexp.MustCompile(`["'][\s\S]+?["']`) - func init() { markup.RegisterParser(Parser{}) - } -// Parser implements markup.Parser for orgmode +// Parser implements markup.Parser for csv files type Parser struct { } @@ -38,11 +34,35 @@ func (Parser) Extensions() []string { } // Render implements markup.Parser -func (p Parser) Render(rawBytes []byte, urlPrefix string, metas map[string]string, isWiki bool) []byte { - rd := csv.NewReader(bytes.NewReader(rawBytes)) - rd.Comma = p.bestDelimiter(rawBytes) +func (Parser) Render(rawBytes []byte, urlPrefix string, metas map[string]string, isWiki bool) []byte { var tmpBlock bytes.Buffer - tmpBlock.WriteString(`<table class="table">`) + + if setting.UI.CSV.MaxFileSize != 0 && setting.UI.CSV.MaxFileSize < int64(len(rawBytes)) { + tmpBlock.WriteString("<pre>") + tmpBlock.WriteString(html.EscapeString(string(rawBytes))) + tmpBlock.WriteString("</pre>") + return tmpBlock.Bytes() + } + + rd := csv.CreateReaderAndGuessDelimiter(rawBytes) + + writeField := func(element, class, field string) { + tmpBlock.WriteString("<") + tmpBlock.WriteString(element) + if len(class) > 0 { + tmpBlock.WriteString(" class=\"") + tmpBlock.WriteString(class) + tmpBlock.WriteString("\"") + } + tmpBlock.WriteString(">") + tmpBlock.WriteString(html.EscapeString(field)) + tmpBlock.WriteString("</") + tmpBlock.WriteString(element) + tmpBlock.WriteString(">") + } + + tmpBlock.WriteString(`<table class="data-table">`) + row := 1 for { fields, err := rd.Read() if err == io.EOF { @@ -52,62 +72,19 @@ func (p Parser) Render(rawBytes []byte, urlPrefix string, metas map[string]strin continue } tmpBlock.WriteString("<tr>") + element := "td" + if row == 1 { + element = "th" + } + writeField(element, "line-num", strconv.Itoa(row)) for _, field := range fields { - tmpBlock.WriteString("<td>") - tmpBlock.WriteString(html.EscapeString(field)) - tmpBlock.WriteString("</td>") + writeField(element, "", field) } tmpBlock.WriteString("</tr>") + + row++ } tmpBlock.WriteString("</table>") return tmpBlock.Bytes() } - -// bestDelimiter scores the input CSV data against delimiters, and returns the best match. -// Reads at most 10k bytes & 10 lines. -func (p Parser) bestDelimiter(data []byte) rune { - maxLines := 10 - maxBytes := util.Min(len(data), 1e4) - text := string(data[:maxBytes]) - text = quoteRegexp.ReplaceAllLiteralString(text, "") - lines := strings.SplitN(text, "\n", maxLines+1) - lines = lines[:util.Min(maxLines, len(lines))] - - delimiters := []rune{',', ';', '\t', '|'} - bestDelim := delimiters[0] - bestScore := 0.0 - for _, delim := range delimiters { - score := p.scoreDelimiter(lines, delim) - if score > bestScore { - bestScore = score - bestDelim = delim - } - } - - return bestDelim -} - -// scoreDelimiter uses a count & regularity metric to evaluate a delimiter against lines of CSV -func (Parser) scoreDelimiter(lines []string, delim rune) (score float64) { - countTotal := 0 - countLineMax := 0 - linesNotEqual := 0 - - for _, line := range lines { - if len(line) == 0 { - continue - } - - countLine := strings.Count(line, string(delim)) - countTotal += countLine - if countLine != countLineMax { - if countLineMax != 0 { - linesNotEqual++ - } - countLineMax = util.Max(countLine, countLineMax) - } - } - - return float64(countTotal) * (1 - float64(linesNotEqual)/float64(len(lines))) -} diff --git a/modules/markup/csv/csv_test.go b/modules/markup/csv/csv_test.go index 4d4e0871e9..5438ebdf5c 100644 --- a/modules/markup/csv/csv_test.go +++ b/modules/markup/csv/csv_test.go @@ -13,14 +13,10 @@ import ( func TestRenderCSV(t *testing.T) { var parser Parser var kases = map[string]string{ - "a": "<table class=\"table\"><tr><td>a</td></tr></table>", - "1,2": "<table class=\"table\"><tr><td>1</td><td>2</td></tr></table>", - "1;2": "<table class=\"table\"><tr><td>1</td><td>2</td></tr></table>", - "1\t2": "<table class=\"table\"><tr><td>1</td><td>2</td></tr></table>", - "1|2": "<table class=\"table\"><tr><td>1</td><td>2</td></tr></table>", - "1,2,3;4,5,6;7,8,9\na;b;c": "<table class=\"table\"><tr><td>1,2,3</td><td>4,5,6</td><td>7,8,9</td></tr><tr><td>a</td><td>b</td><td>c</td></tr></table>", - "\"1,2,3,4\";\"a\nb\"\nc;d": "<table class=\"table\"><tr><td>1,2,3,4</td><td>a\nb</td></tr><tr><td>c</td><td>d</td></tr></table>", - "<br/>": "<table class=\"table\"><tr><td><br/></td></tr></table>", + "a": "<table class=\"data-table\"><tr><th class=\"line-num\">1</th><th>a</th></tr></table>", + "1,2": "<table class=\"data-table\"><tr><th class=\"line-num\">1</th><th>1</th><th>2</th></tr></table>", + "1;2\n3;4": "<table class=\"data-table\"><tr><th class=\"line-num\">1</th><th>1</th><th>2</th></tr><tr><td class=\"line-num\">2</td><td>3</td><td>4</td></tr></table>", + "<br/>": "<table class=\"data-table\"><tr><th class=\"line-num\">1</th><th><br/></th></tr></table>", } for k, v := range kases { diff --git a/modules/markup/sanitizer.go b/modules/markup/sanitizer.go index 9214a75fb3..19feaa3cce 100644 --- a/modules/markup/sanitizer.go +++ b/modules/markup/sanitizer.go @@ -69,6 +69,10 @@ func ReplaceSanitizer() { // Allow icons, emojis, and chroma syntax on span sanitizer.policy.AllowAttrs("class").Matching(regexp.MustCompile(`^((icon(\s+[\p{L}\p{N}_-]+)+)|(emoji))$|^([a-z][a-z0-9]{0,2})$`)).OnElements("span") + // Allow data tables + sanitizer.policy.AllowAttrs("class").Matching(regexp.MustCompile(`data-table`)).OnElements("table") + sanitizer.policy.AllowAttrs("class").Matching(regexp.MustCompile(`line-num`)).OnElements("th", "td") + // Allow generally safe attributes generalSafeAttrs := []string{"abbr", "accept", "accept-charset", "accesskey", "action", "align", "alt", diff --git a/modules/setting/setting.go b/modules/setting/setting.go index 6a9868713f..280987ed66 100644 --- a/modules/setting/setting.go +++ b/modules/setting/setting.go @@ -213,6 +213,10 @@ var ( Enabled bool `ini:"ENABLE_RENDER"` } `ini:"ui.svg"` + CSV struct { + MaxFileSize int64 + } `ini:"ui.csv"` + Admin struct { UserPagingNum int RepoPagingNum int @@ -258,6 +262,11 @@ var ( }{ Enabled: true, }, + CSV: struct { + MaxFileSize int64 + }{ + MaxFileSize: 524288, + }, Admin: struct { UserPagingNum int RepoPagingNum int |