diff options
Diffstat (limited to 'modules/csv/csv.go')
-rw-r--r-- | modules/csv/csv.go | 93 |
1 files changed, 93 insertions, 0 deletions
diff --git a/modules/csv/csv.go b/modules/csv/csv.go new file mode 100644 index 0000000000..1aa78fdeec --- /dev/null +++ b/modules/csv/csv.go @@ -0,0 +1,93 @@ +// Copyright 2021 The Gitea Authors. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package csv + +import ( + "bytes" + "encoding/csv" + "errors" + "regexp" + "strings" + + "code.gitea.io/gitea/modules/translation" + "code.gitea.io/gitea/modules/util" +) + +var quoteRegexp = regexp.MustCompile(`["'][\s\S]+?["']`) + +// CreateReader creates a csv.Reader with the given delimiter. +func CreateReader(rawBytes []byte, delimiter rune) *csv.Reader { + rd := csv.NewReader(bytes.NewReader(rawBytes)) + rd.Comma = delimiter + rd.TrimLeadingSpace = true + return rd +} + +// CreateReaderAndGuessDelimiter tries to guess the field delimiter from the content and creates a csv.Reader. +func CreateReaderAndGuessDelimiter(rawBytes []byte) *csv.Reader { + delimiter := guessDelimiter(rawBytes) + return CreateReader(rawBytes, delimiter) +} + +// guessDelimiter scores the input CSV data against delimiters, and returns the best match. +// Reads at most 10k bytes & 10 lines. +func guessDelimiter(data []byte) rune { + maxLines := 10 + maxBytes := util.Min(len(data), 1e4) + text := string(data[:maxBytes]) + text = quoteRegexp.ReplaceAllLiteralString(text, "") + lines := strings.SplitN(text, "\n", maxLines+1) + lines = lines[:util.Min(maxLines, len(lines))] + + delimiters := []rune{',', ';', '\t', '|', '@'} + bestDelim := delimiters[0] + bestScore := 0.0 + for _, delim := range delimiters { + score := scoreDelimiter(lines, delim) + if score > bestScore { + bestScore = score + bestDelim = delim + } + } + + return bestDelim +} + +// scoreDelimiter uses a count & regularity metric to evaluate a delimiter against lines of CSV. +func scoreDelimiter(lines []string, delim rune) float64 { + countTotal := 0 + countLineMax := 0 + linesNotEqual := 0 + + for _, line := range lines { + if len(line) == 0 { + continue + } + + countLine := strings.Count(line, string(delim)) + countTotal += countLine + if countLine != countLineMax { + if countLineMax != 0 { + linesNotEqual++ + } + countLineMax = util.Max(countLine, countLineMax) + } + } + + return float64(countTotal) * (1 - float64(linesNotEqual)/float64(len(lines))) +} + +// FormatError converts csv errors into readable messages. +func FormatError(err error, locale translation.Locale) (string, error) { + var perr *csv.ParseError + if errors.As(err, &perr) { + if perr.Err == csv.ErrFieldCount { + return locale.Tr("repo.error.csv.invalid_field_count", perr.Line), nil + } + return locale.Tr("repo.error.csv.unexpected", perr.Line, perr.Column), nil + } + + return "", err +} |