diff options
author | KN4CK3R <KN4CK3R@users.noreply.github.com> | 2021-03-29 22:44:28 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-03-29 22:44:28 +0200 |
commit | 0c6137617fbf41ee6cb315f96a2acc2dd91203e8 (patch) | |
tree | 27c8d1304334f1783232166927093419079ecd2a /modules/csv/csv.go | |
parent | d3b8127ad372bbce8d891d8893ffe6e834590751 (diff) | |
download | gitea-0c6137617fbf41ee6cb315f96a2acc2dd91203e8.tar.gz gitea-0c6137617fbf41ee6cb315f96a2acc2dd91203e8.zip |
Add Tabular Diff for CSV files (#14661)
Implements request #14320. The rendering of CSV files now matches the diff style.
* Moved CSV logic into base package.
* Added method to create a tabular diff.
* Added CSV compare context.
* Added CSV diff template.
* Use new table style in CSV markup.
* Added file size limit for CSV rendering.
* Display CSV parser errors in diff.
* Lazy read single file.
* Lazy read rows for full diff.
* Added unit tests for various CSV changes.
Diffstat (limited to 'modules/csv/csv.go')
-rw-r--r-- | modules/csv/csv.go | 93 |
1 files changed, 93 insertions, 0 deletions
diff --git a/modules/csv/csv.go b/modules/csv/csv.go new file mode 100644 index 0000000000..1aa78fdeec --- /dev/null +++ b/modules/csv/csv.go @@ -0,0 +1,93 @@ +// Copyright 2021 The Gitea Authors. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package csv + +import ( + "bytes" + "encoding/csv" + "errors" + "regexp" + "strings" + + "code.gitea.io/gitea/modules/translation" + "code.gitea.io/gitea/modules/util" +) + +var quoteRegexp = regexp.MustCompile(`["'][\s\S]+?["']`) + +// CreateReader creates a csv.Reader with the given delimiter. +func CreateReader(rawBytes []byte, delimiter rune) *csv.Reader { + rd := csv.NewReader(bytes.NewReader(rawBytes)) + rd.Comma = delimiter + rd.TrimLeadingSpace = true + return rd +} + +// CreateReaderAndGuessDelimiter tries to guess the field delimiter from the content and creates a csv.Reader. +func CreateReaderAndGuessDelimiter(rawBytes []byte) *csv.Reader { + delimiter := guessDelimiter(rawBytes) + return CreateReader(rawBytes, delimiter) +} + +// guessDelimiter scores the input CSV data against delimiters, and returns the best match. +// Reads at most 10k bytes & 10 lines. +func guessDelimiter(data []byte) rune { + maxLines := 10 + maxBytes := util.Min(len(data), 1e4) + text := string(data[:maxBytes]) + text = quoteRegexp.ReplaceAllLiteralString(text, "") + lines := strings.SplitN(text, "\n", maxLines+1) + lines = lines[:util.Min(maxLines, len(lines))] + + delimiters := []rune{',', ';', '\t', '|', '@'} + bestDelim := delimiters[0] + bestScore := 0.0 + for _, delim := range delimiters { + score := scoreDelimiter(lines, delim) + if score > bestScore { + bestScore = score + bestDelim = delim + } + } + + return bestDelim +} + +// scoreDelimiter uses a count & regularity metric to evaluate a delimiter against lines of CSV. 
+func scoreDelimiter(lines []string, delim rune) float64 { + countTotal := 0 + countLineMax := 0 + linesNotEqual := 0 + + for _, line := range lines { + if len(line) == 0 { + continue + } + + countLine := strings.Count(line, string(delim)) + countTotal += countLine + if countLine != countLineMax { + if countLineMax != 0 { + linesNotEqual++ + } + countLineMax = util.Max(countLine, countLineMax) + } + } + + return float64(countTotal) * (1 - float64(linesNotEqual)/float64(len(lines))) +} + +// FormatError converts csv errors into readable messages. +func FormatError(err error, locale translation.Locale) (string, error) { + var perr *csv.ParseError + if errors.As(err, &perr) { + if perr.Err == csv.ErrFieldCount { + return locale.Tr("repo.error.csv.invalid_field_count", perr.Line), nil + } + return locale.Tr("repo.error.csv.unexpected", perr.Line, perr.Column), nil + } + + return "", err +} |