1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
|
// Copyright 2018 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package markup
import (
"bytes"
"encoding/csv"
"html"
"io"
"regexp"
"strings"
"code.gitea.io/gitea/modules/markup"
"code.gitea.io/gitea/modules/util"
)
var quoteRegexp = regexp.MustCompile(`["'][\s\S]+?["']`)
func init() {
markup.RegisterParser(Parser{})
}
// Parser implements markup.Parser for orgmode
type Parser struct {
}
// Name implements markup.Parser
func (Parser) Name() string {
return "csv"
}
// Extensions implements markup.Parser
func (Parser) Extensions() []string {
return []string{".csv", ".tsv"}
}
// Render implements markup.Parser
func (p Parser) Render(rawBytes []byte, urlPrefix string, metas map[string]string, isWiki bool) []byte {
rd := csv.NewReader(bytes.NewReader(rawBytes))
rd.Comma = p.bestDelimiter(rawBytes)
var tmpBlock bytes.Buffer
tmpBlock.WriteString(`<table class="table">`)
for {
fields, err := rd.Read()
if err == io.EOF {
break
}
if err != nil {
continue
}
tmpBlock.WriteString("<tr>")
for _, field := range fields {
tmpBlock.WriteString("<td>")
tmpBlock.WriteString(html.EscapeString(field))
tmpBlock.WriteString("</td>")
}
tmpBlock.WriteString("</tr>")
}
tmpBlock.WriteString("</table>")
return tmpBlock.Bytes()
}
// bestDelimiter scores the input CSV data against delimiters, and returns the best match.
// Reads at most 10k bytes & 10 lines.
func (p Parser) bestDelimiter(data []byte) rune {
maxLines := 10
maxBytes := util.Min(len(data), 1e4)
text := string(data[:maxBytes])
text = quoteRegexp.ReplaceAllLiteralString(text, "")
lines := strings.SplitN(text, "\n", maxLines+1)
lines = lines[:util.Min(maxLines, len(lines))]
delimiters := []rune{',', ';', '\t', '|'}
bestDelim := delimiters[0]
bestScore := 0.0
for _, delim := range delimiters {
score := p.scoreDelimiter(lines, delim)
if score > bestScore {
bestScore = score
bestDelim = delim
}
}
return bestDelim
}
// scoreDelimiter uses a count & regularity metric to evaluate a delimiter against lines of CSV
func (Parser) scoreDelimiter(lines []string, delim rune) (score float64) {
countTotal := 0
countLineMax := 0
linesNotEqual := 0
for _, line := range lines {
if len(line) == 0 {
continue
}
countLine := strings.Count(line, string(delim))
countTotal += countLine
if countLine != countLineMax {
if countLineMax != 0 {
linesNotEqual++
}
countLineMax = util.Max(countLine, countLineMax)
}
}
return float64(countTotal) * (1 - float64(linesNotEqual)/float64(len(lines)))
}
|