summary refs log tree commit diff stats
path: root/modules/csv
diff options
context:
space:
mode:
Diffstat (limited to 'modules/csv')
-rw-r--r-- modules/csv/csv.go 93
-rw-r--r-- modules/csv/csv_test.go 40
2 files changed, 133 insertions, 0 deletions
diff --git a/modules/csv/csv.go b/modules/csv/csv.go
new file mode 100644
index 0000000000..1aa78fdeec
--- /dev/null
+++ b/modules/csv/csv.go
@@ -0,0 +1,93 @@
+// Copyright 2021 The Gitea Authors. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+package csv
+
+import (
+ "bytes"
+ "encoding/csv"
+ "errors"
+ "regexp"
+ "strings"
+
+ "code.gitea.io/gitea/modules/translation"
+ "code.gitea.io/gitea/modules/util"
+)
+
+// quoteRegexp matches a run of one or more characters (newlines included,
+// shortest match) that starts and ends with a single or double quote; the
+// opening and closing quote are not required to be the same character.
+// guessDelimiter removes every match before it scores delimiters.
+var quoteRegexp = regexp.MustCompile(`["'][\s\S]+?["']`)
+
+// CreateReader returns a csv.Reader over rawBytes that splits fields on
+// delimiter and ignores leading white space in each field.
+func CreateReader(rawBytes []byte, delimiter rune) *csv.Reader {
+	reader := csv.NewReader(bytes.NewReader(rawBytes))
+	reader.TrimLeadingSpace = true
+	reader.Comma = delimiter
+	return reader
+}
+
+// CreateReaderAndGuessDelimiter returns a csv.Reader over rawBytes whose field
+// delimiter is the one guessDelimiter selects for that same content.
+func CreateReaderAndGuessDelimiter(rawBytes []byte) *csv.Reader {
+	return CreateReader(rawBytes, guessDelimiter(rawBytes))
+}
+
+// guessDelimiter picks a field delimiter for data by scoring every candidate
+// with scoreDelimiter and keeping the highest scorer.
+// Only the first 10k bytes are looked at and, once quoted text has been
+// removed, only the first 10 lines. The first candidate (a comma) is returned
+// when no candidate scores above zero.
+func guessDelimiter(data []byte) rune {
+	const (
+		lineLimit = 10
+		byteLimit = 1e4
+	)
+
+	sample := string(data[:util.Min(len(data), byteLimit)])
+	// Remove quoted sections before the sample is split into lines and scored.
+	sample = quoteRegexp.ReplaceAllLiteralString(sample, "")
+
+	rows := strings.SplitN(sample, "\n", lineLimit+1)
+	rows = rows[:util.Min(lineLimit, len(rows))]
+
+	candidates := []rune{',', ';', '\t', '|', '@'}
+	winner, topScore := candidates[0], 0.0
+	for _, candidate := range candidates {
+		if s := scoreDelimiter(rows, candidate); s > topScore {
+			winner, topScore = candidate, s
+		}
+	}
+
+	return winner
+}
+
+// scoreDelimiter rates how well delim fits lines as a CSV field separator.
+//
+// The score is the total number of occurrences of delim, scaled down by the
+// share of lines whose occurrence count differs from the running maximum
+// (lines seen before the first occurrence are not penalised). Empty lines are
+// skipped while counting but still contribute to the line total used for the
+// scaling. An empty lines slice scores 0.
+func scoreDelimiter(lines []string, delim rune) float64 {
+	// Guard the division at the end: with no lines it would be 0/0, i.e. NaN.
+	if len(lines) == 0 {
+		return 0
+	}
+
+	sep := string(delim) // loop-invariant, convert once
+	countTotal := 0
+	countLineMax := 0
+	linesNotEqual := 0
+
+	for _, line := range lines {
+		if len(line) == 0 {
+			continue
+		}
+
+		countLine := strings.Count(line, sep)
+		countTotal += countLine
+		if countLine == countLineMax {
+			continue
+		}
+		// Only count a mismatch once some line has established a non-zero maximum.
+		if countLineMax != 0 {
+			linesNotEqual++
+		}
+		if countLine > countLineMax {
+			countLineMax = countLine
+		}
+	}
+
+	return float64(countTotal) * (1 - float64(linesNotEqual)/float64(len(lines)))
+}
+
+// FormatError converts csv errors into readable, translated messages.
+//
+// When err is (or wraps) a *csv.ParseError the translated message is returned
+// together with a nil error: the "invalid field count" message with the line
+// number if the cause is csv.ErrFieldCount, otherwise the "unexpected" message
+// with line and column. Any other err is handed back unchanged with an empty
+// message.
+func FormatError(err error, locale translation.Locale) (string, error) {
+	var perr *csv.ParseError
+	if !errors.As(err, &perr) {
+		return "", err
+	}
+
+	// errors.Is instead of == so a wrapped sentinel is recognised as well.
+	if errors.Is(perr.Err, csv.ErrFieldCount) {
+		return locale.Tr("repo.error.csv.invalid_field_count", perr.Line), nil
+	}
+	return locale.Tr("repo.error.csv.unexpected", perr.Line, perr.Column), nil
+}
diff --git a/modules/csv/csv_test.go b/modules/csv/csv_test.go
new file mode 100644
index 0000000000..3a7584e21d
--- /dev/null
+++ b/modules/csv/csv_test.go
@@ -0,0 +1,40 @@
+// Copyright 2021 The Gitea Authors. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+package csv
+
+import (
+ "testing"
+
+ "github.com/stretchr/testify/assert"
+)
+
+// TestCreateReader checks that the requested delimiter ends up on the reader.
+func TestCreateReader(t *testing.T) {
+	const delimiter = ','
+
+	reader := CreateReader([]byte{}, delimiter)
+	assert.Equal(t, delimiter, reader.Comma)
+}
+
+// TestCreateReaderAndGuessDelimiter checks that a semicolon-separated input
+// yields a reader configured with ';' as its delimiter.
+func TestCreateReaderAndGuessDelimiter(t *testing.T) {
+	content := []byte("a;b;c\n1;2;3\n4;5;6")
+
+	reader := CreateReaderAndGuessDelimiter(content)
+	assert.Equal(t, ';', reader.Comma)
+}
+
+// TestGuessDelimiter runs guessDelimiter over a table of inputs, each mapped
+// to the delimiter it is expected to produce.
+func TestGuessDelimiter(t *testing.T) {
+	kases := map[string]rune{
+		"a":                         ',',
+		"1,2":                       ',',
+		"1;2":                       ';',
+		"1\t2":                      '\t',
+		"1|2":                       '|',
+		"1,2,3;4,5,6;7,8,9\na;b;c":  ';',
+		"\"1,2,3,4\";\"a\nb\"\nc;d": ';',
+		"<br/>":                     ',',
+	}
+
+	for input, expected := range kases {
+		// testify takes (expected, actual); the input is added to the message
+		// so a failure identifies which table entry broke.
+		assert.EqualValues(t, expected, guessDelimiter([]byte(input)), "input: %q", input)
+	}
+}