aboutsummaryrefslogtreecommitdiffstats
path: root/modules
diff options
context:
space:
mode:
authorKN4CK3R <KN4CK3R@users.noreply.github.com>2021-03-29 22:44:28 +0200
committerGitHub <noreply@github.com>2021-03-29 22:44:28 +0200
commit0c6137617fbf41ee6cb315f96a2acc2dd91203e8 (patch)
tree27c8d1304334f1783232166927093419079ecd2a /modules
parentd3b8127ad372bbce8d891d8893ffe6e834590751 (diff)
downloadgitea-0c6137617fbf41ee6cb315f96a2acc2dd91203e8.tar.gz
gitea-0c6137617fbf41ee6cb315f96a2acc2dd91203e8.zip
Add Tabular Diff for CSV files (#14661)
Implements request #14320 The rendering of CSV files does match the diff style. * Moved CSV logic into base package. * Added method to create a tabular diff. * Added CSV compare context. * Added CSV diff template. * Use new table style in CSV markup. * Added file size limit for CSV rendering. * Display CSV parser errors in diff. * Lazy read single file. * Lazy read rows for full diff. * Added unit tests for various CSV changes.
Diffstat (limited to 'modules')
-rw-r--r--modules/csv/csv.go93
-rw-r--r--modules/csv/csv_test.go40
-rw-r--r--modules/markup/csv/csv.go103
-rw-r--r--modules/markup/csv/csv_test.go12
-rw-r--r--modules/markup/sanitizer.go4
-rw-r--r--modules/setting/setting.go9
6 files changed, 190 insertions, 71 deletions
diff --git a/modules/csv/csv.go b/modules/csv/csv.go
new file mode 100644
index 0000000000..1aa78fdeec
--- /dev/null
+++ b/modules/csv/csv.go
@@ -0,0 +1,93 @@
+// Copyright 2021 The Gitea Authors. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+package csv
+
+import (
+ "bytes"
+ "encoding/csv"
+ "errors"
+ "regexp"
+ "strings"
+
+ "code.gitea.io/gitea/modules/translation"
+ "code.gitea.io/gitea/modules/util"
+)
+
+var quoteRegexp = regexp.MustCompile(`["'][\s\S]+?["']`)
+
+// CreateReader creates a csv.Reader with the given delimiter.
+func CreateReader(rawBytes []byte, delimiter rune) *csv.Reader {
+ rd := csv.NewReader(bytes.NewReader(rawBytes))
+ rd.Comma = delimiter
+ rd.TrimLeadingSpace = true
+ return rd
+}
+
+// CreateReaderAndGuessDelimiter tries to guess the field delimiter from the content and creates a csv.Reader.
+func CreateReaderAndGuessDelimiter(rawBytes []byte) *csv.Reader {
+ delimiter := guessDelimiter(rawBytes)
+ return CreateReader(rawBytes, delimiter)
+}
+
+// guessDelimiter scores the input CSV data against delimiters, and returns the best match.
+// Reads at most 10k bytes & 10 lines.
+func guessDelimiter(data []byte) rune {
+ maxLines := 10
+ maxBytes := util.Min(len(data), 1e4)
+ text := string(data[:maxBytes])
+ text = quoteRegexp.ReplaceAllLiteralString(text, "")
+ lines := strings.SplitN(text, "\n", maxLines+1)
+ lines = lines[:util.Min(maxLines, len(lines))]
+
+ delimiters := []rune{',', ';', '\t', '|', '@'}
+ bestDelim := delimiters[0]
+ bestScore := 0.0
+ for _, delim := range delimiters {
+ score := scoreDelimiter(lines, delim)
+ if score > bestScore {
+ bestScore = score
+ bestDelim = delim
+ }
+ }
+
+ return bestDelim
+}
+
+// scoreDelimiter uses a count & regularity metric to evaluate a delimiter against lines of CSV.
+func scoreDelimiter(lines []string, delim rune) float64 {
+ countTotal := 0
+ countLineMax := 0
+ linesNotEqual := 0
+
+ for _, line := range lines {
+ if len(line) == 0 {
+ continue
+ }
+
+ countLine := strings.Count(line, string(delim))
+ countTotal += countLine
+ if countLine != countLineMax {
+ if countLineMax != 0 {
+ linesNotEqual++
+ }
+ countLineMax = util.Max(countLine, countLineMax)
+ }
+ }
+
+ return float64(countTotal) * (1 - float64(linesNotEqual)/float64(len(lines)))
+}
+
+// FormatError converts csv errors into readable messages.
+func FormatError(err error, locale translation.Locale) (string, error) {
+ var perr *csv.ParseError
+ if errors.As(err, &perr) {
+ if perr.Err == csv.ErrFieldCount {
+ return locale.Tr("repo.error.csv.invalid_field_count", perr.Line), nil
+ }
+ return locale.Tr("repo.error.csv.unexpected", perr.Line, perr.Column), nil
+ }
+
+ return "", err
+}
diff --git a/modules/csv/csv_test.go b/modules/csv/csv_test.go
new file mode 100644
index 0000000000..3a7584e21d
--- /dev/null
+++ b/modules/csv/csv_test.go
@@ -0,0 +1,40 @@
+// Copyright 2021 The Gitea Authors. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+package csv
+
+import (
+ "testing"
+
+ "github.com/stretchr/testify/assert"
+)
+
+func TestCreateReader(t *testing.T) {
+ rd := CreateReader([]byte{}, ',')
+ assert.Equal(t, ',', rd.Comma)
+}
+
+func TestCreateReaderAndGuessDelimiter(t *testing.T) {
+ input := "a;b;c\n1;2;3\n4;5;6"
+
+ rd := CreateReaderAndGuessDelimiter([]byte(input))
+ assert.Equal(t, ';', rd.Comma)
+}
+
+func TestGuessDelimiter(t *testing.T) {
+ var kases = map[string]rune{
+ "a": ',',
+ "1,2": ',',
+ "1;2": ';',
+ "1\t2": '\t',
+ "1|2": '|',
+ "1,2,3;4,5,6;7,8,9\na;b;c": ';',
+ "\"1,2,3,4\";\"a\nb\"\nc;d": ';',
+ "<br/>": ',',
+ }
+
+ for k, v := range kases {
+ assert.EqualValues(t, guessDelimiter([]byte(k)), v)
+ }
+}
diff --git a/modules/markup/csv/csv.go b/modules/markup/csv/csv.go
index 1e3acc9b47..430e759eb5 100644
--- a/modules/markup/csv/csv.go
+++ b/modules/markup/csv/csv.go
@@ -6,24 +6,20 @@ package markup
import (
"bytes"
- "encoding/csv"
"html"
"io"
- "regexp"
- "strings"
+ "strconv"
+ "code.gitea.io/gitea/modules/csv"
"code.gitea.io/gitea/modules/markup"
- "code.gitea.io/gitea/modules/util"
+ "code.gitea.io/gitea/modules/setting"
)
-var quoteRegexp = regexp.MustCompile(`["'][\s\S]+?["']`)
-
func init() {
markup.RegisterParser(Parser{})
-
}
-// Parser implements markup.Parser for orgmode
+// Parser implements markup.Parser for csv files
type Parser struct {
}
@@ -38,11 +34,35 @@ func (Parser) Extensions() []string {
}
// Render implements markup.Parser
-func (p Parser) Render(rawBytes []byte, urlPrefix string, metas map[string]string, isWiki bool) []byte {
- rd := csv.NewReader(bytes.NewReader(rawBytes))
- rd.Comma = p.bestDelimiter(rawBytes)
+func (Parser) Render(rawBytes []byte, urlPrefix string, metas map[string]string, isWiki bool) []byte {
var tmpBlock bytes.Buffer
- tmpBlock.WriteString(`<table class="table">`)
+
+ if setting.UI.CSV.MaxFileSize != 0 && setting.UI.CSV.MaxFileSize < int64(len(rawBytes)) {
+ tmpBlock.WriteString("<pre>")
+ tmpBlock.WriteString(html.EscapeString(string(rawBytes)))
+ tmpBlock.WriteString("</pre>")
+ return tmpBlock.Bytes()
+ }
+
+ rd := csv.CreateReaderAndGuessDelimiter(rawBytes)
+
+ writeField := func(element, class, field string) {
+ tmpBlock.WriteString("<")
+ tmpBlock.WriteString(element)
+ if len(class) > 0 {
+ tmpBlock.WriteString(" class=\"")
+ tmpBlock.WriteString(class)
+ tmpBlock.WriteString("\"")
+ }
+ tmpBlock.WriteString(">")
+ tmpBlock.WriteString(html.EscapeString(field))
+ tmpBlock.WriteString("</")
+ tmpBlock.WriteString(element)
+ tmpBlock.WriteString(">")
+ }
+
+ tmpBlock.WriteString(`<table class="data-table">`)
+ row := 1
for {
fields, err := rd.Read()
if err == io.EOF {
@@ -52,62 +72,19 @@ func (p Parser) Render(rawBytes []byte, urlPrefix string, metas map[string]strin
continue
}
tmpBlock.WriteString("<tr>")
+ element := "td"
+ if row == 1 {
+ element = "th"
+ }
+ writeField(element, "line-num", strconv.Itoa(row))
for _, field := range fields {
- tmpBlock.WriteString("<td>")
- tmpBlock.WriteString(html.EscapeString(field))
- tmpBlock.WriteString("</td>")
+ writeField(element, "", field)
}
tmpBlock.WriteString("</tr>")
+
+ row++
}
tmpBlock.WriteString("</table>")
return tmpBlock.Bytes()
}
-
-// bestDelimiter scores the input CSV data against delimiters, and returns the best match.
-// Reads at most 10k bytes & 10 lines.
-func (p Parser) bestDelimiter(data []byte) rune {
- maxLines := 10
- maxBytes := util.Min(len(data), 1e4)
- text := string(data[:maxBytes])
- text = quoteRegexp.ReplaceAllLiteralString(text, "")
- lines := strings.SplitN(text, "\n", maxLines+1)
- lines = lines[:util.Min(maxLines, len(lines))]
-
- delimiters := []rune{',', ';', '\t', '|'}
- bestDelim := delimiters[0]
- bestScore := 0.0
- for _, delim := range delimiters {
- score := p.scoreDelimiter(lines, delim)
- if score > bestScore {
- bestScore = score
- bestDelim = delim
- }
- }
-
- return bestDelim
-}
-
-// scoreDelimiter uses a count & regularity metric to evaluate a delimiter against lines of CSV
-func (Parser) scoreDelimiter(lines []string, delim rune) (score float64) {
- countTotal := 0
- countLineMax := 0
- linesNotEqual := 0
-
- for _, line := range lines {
- if len(line) == 0 {
- continue
- }
-
- countLine := strings.Count(line, string(delim))
- countTotal += countLine
- if countLine != countLineMax {
- if countLineMax != 0 {
- linesNotEqual++
- }
- countLineMax = util.Max(countLine, countLineMax)
- }
- }
-
- return float64(countTotal) * (1 - float64(linesNotEqual)/float64(len(lines)))
-}
diff --git a/modules/markup/csv/csv_test.go b/modules/markup/csv/csv_test.go
index 4d4e0871e9..5438ebdf5c 100644
--- a/modules/markup/csv/csv_test.go
+++ b/modules/markup/csv/csv_test.go
@@ -13,14 +13,10 @@ import (
func TestRenderCSV(t *testing.T) {
var parser Parser
var kases = map[string]string{
- "a": "<table class=\"table\"><tr><td>a</td></tr></table>",
- "1,2": "<table class=\"table\"><tr><td>1</td><td>2</td></tr></table>",
- "1;2": "<table class=\"table\"><tr><td>1</td><td>2</td></tr></table>",
- "1\t2": "<table class=\"table\"><tr><td>1</td><td>2</td></tr></table>",
- "1|2": "<table class=\"table\"><tr><td>1</td><td>2</td></tr></table>",
- "1,2,3;4,5,6;7,8,9\na;b;c": "<table class=\"table\"><tr><td>1,2,3</td><td>4,5,6</td><td>7,8,9</td></tr><tr><td>a</td><td>b</td><td>c</td></tr></table>",
- "\"1,2,3,4\";\"a\nb\"\nc;d": "<table class=\"table\"><tr><td>1,2,3,4</td><td>a\nb</td></tr><tr><td>c</td><td>d</td></tr></table>",
- "<br/>": "<table class=\"table\"><tr><td>&lt;br/&gt;</td></tr></table>",
+ "a": "<table class=\"data-table\"><tr><th class=\"line-num\">1</th><th>a</th></tr></table>",
+ "1,2": "<table class=\"data-table\"><tr><th class=\"line-num\">1</th><th>1</th><th>2</th></tr></table>",
+ "1;2\n3;4": "<table class=\"data-table\"><tr><th class=\"line-num\">1</th><th>1</th><th>2</th></tr><tr><td class=\"line-num\">2</td><td>3</td><td>4</td></tr></table>",
+ "<br/>": "<table class=\"data-table\"><tr><th class=\"line-num\">1</th><th>&lt;br/&gt;</th></tr></table>",
}
for k, v := range kases {
diff --git a/modules/markup/sanitizer.go b/modules/markup/sanitizer.go
index 9214a75fb3..19feaa3cce 100644
--- a/modules/markup/sanitizer.go
+++ b/modules/markup/sanitizer.go
@@ -69,6 +69,10 @@ func ReplaceSanitizer() {
// Allow icons, emojis, and chroma syntax on span
sanitizer.policy.AllowAttrs("class").Matching(regexp.MustCompile(`^((icon(\s+[\p{L}\p{N}_-]+)+)|(emoji))$|^([a-z][a-z0-9]{0,2})$`)).OnElements("span")
+ // Allow data tables
+ sanitizer.policy.AllowAttrs("class").Matching(regexp.MustCompile(`data-table`)).OnElements("table")
+ sanitizer.policy.AllowAttrs("class").Matching(regexp.MustCompile(`line-num`)).OnElements("th", "td")
+
// Allow generally safe attributes
generalSafeAttrs := []string{"abbr", "accept", "accept-charset",
"accesskey", "action", "align", "alt",
diff --git a/modules/setting/setting.go b/modules/setting/setting.go
index 6a9868713f..280987ed66 100644
--- a/modules/setting/setting.go
+++ b/modules/setting/setting.go
@@ -213,6 +213,10 @@ var (
Enabled bool `ini:"ENABLE_RENDER"`
} `ini:"ui.svg"`
+ CSV struct {
+ MaxFileSize int64
+ } `ini:"ui.csv"`
+
Admin struct {
UserPagingNum int
RepoPagingNum int
@@ -258,6 +262,11 @@ var (
}{
Enabled: true,
},
+ CSV: struct {
+ MaxFileSize int64
+ }{
+ MaxFileSize: 524288,
+ },
Admin: struct {
UserPagingNum int
RepoPagingNum int