]> source.dussan.org Git - gitea.git/commitdiff
Detect delimiter in CSV rendering (#7869)
author Norwin <noerw@users.noreply.github.com>
Thu, 15 Aug 2019 22:09:50 +0000 (22:09 +0000)
committer Lauris BH <lauris@nix.lv>
Thu, 15 Aug 2019 22:09:50 +0000 (01:09 +0300)
* detect csv delimiter in csv rendering

fixes #7868

* make linter happy

* fix failing testcase & use ints where possible

* expose markup type to template

previously all markup had the .markdown class, which is incorrect,
as it applies markdown CSS & JS logic to CSV rendering

* fix build (missing `make css`)

* ignore quoted csv content for delimiter scoring

also fix html generation

modules/markup/csv/csv.go
modules/markup/csv/csv_test.go
public/css/index.css
public/less/_repository.less
routers/repo/view.go
templates/repo/view_file.tmpl

index 077947e774425a7b9eef33c95ad290dcb9803c85..1e3acc9b47e2fdba9e444c1cd7a96227766ca819 100644 (file)
@@ -9,12 +9,18 @@ import (
        "encoding/csv"
        "html"
        "io"
+       "regexp"
+       "strings"
 
        "code.gitea.io/gitea/modules/markup"
+       "code.gitea.io/gitea/modules/util"
 )
 
+var quoteRegexp = regexp.MustCompile(`["'][\s\S]+?["']`)
+
 func init() {
        markup.RegisterParser(Parser{}) // register this package's Parser with the markup package at package initialization
+
 }
 
 // Parser implements markup.Parser for CSV
@@ -28,12 +34,13 @@ func (Parser) Name() string {
 
 // Extensions implements markup.Parser
 func (Parser) Extensions() []string {
-       return []string{".csv"}
+       return []string{".csv", ".tsv"}
 }
 
 // Render implements markup.Parser
-func (Parser) Render(rawBytes []byte, urlPrefix string, metas map[string]string, isWiki bool) []byte {
+func (p Parser) Render(rawBytes []byte, urlPrefix string, metas map[string]string, isWiki bool) []byte {
        rd := csv.NewReader(bytes.NewReader(rawBytes))
+       rd.Comma = p.bestDelimiter(rawBytes)
        var tmpBlock bytes.Buffer
        tmpBlock.WriteString(`<table class="table">`)
        for {
@@ -50,9 +57,57 @@ func (Parser) Render(rawBytes []byte, urlPrefix string, metas map[string]string,
                        tmpBlock.WriteString(html.EscapeString(field))
                        tmpBlock.WriteString("</td>")
                }
-               tmpBlock.WriteString("<tr>")
+               tmpBlock.WriteString("</tr>")
        }
        tmpBlock.WriteString("</table>")
 
        return tmpBlock.Bytes()
 }
+
+// bestDelimiter scores each candidate delimiter (',', ';', '\t', '|') against a sample of data and returns the highest-scoring one, falling back to ',' when no candidate scores above zero.
+// The sample is at most the first 10k bytes and 10 lines of data, with quoted spans removed first so that delimiters inside quotes are not counted.
+func (p Parser) bestDelimiter(data []byte) rune {
+       maxLines := 10
+       maxBytes := util.Min(len(data), 1e4) // cap the sample so large inputs are not scanned in full
+       text := string(data[:maxBytes])
+       text = quoteRegexp.ReplaceAllLiteralString(text, "") // strip quoted spans (the pattern also matches across newlines) before counting
+       lines := strings.SplitN(text, "\n", maxLines+1) // the extra element collects the unsplit remainder
+       lines = lines[:util.Min(maxLines, len(lines))] // drop that remainder, keeping at most maxLines lines
+
+       delimiters := []rune{',', ';', '\t', '|'}
+       bestDelim := delimiters[0] // default to ',' if nothing scores above zero
+       bestScore := 0.0
+       for _, delim := range delimiters {
+               score := p.scoreDelimiter(lines, delim)
+               if score > bestScore { // strict comparison: on a tie the earlier candidate wins
+                       bestScore = score
+                       bestDelim = delim
+               }
+       }
+
+       return bestDelim
+}
+
+// scoreDelimiter returns the total number of occurrences of delim in lines, scaled down by the fraction of lines whose per-line count differs from the largest count seen so far; a higher score means a better candidate.
+func (Parser) scoreDelimiter(lines []string, delim rune) (score float64) {
+       countTotal := 0 // occurrences of delim across all non-empty lines
+       countLineMax := 0 // largest per-line count seen so far
+       linesNotEqual := 0 // lines whose count deviated from countLineMax when they were visited
+
+       for _, line := range lines {
+               if len(line) == 0 { // empty lines are not scored, but they still count in len(lines) below
+                       continue
+               }
+
+               countLine := strings.Count(line, string(delim))
+               countTotal += countLine
+               if countLine != countLineMax {
+                       if countLineMax != 0 { // no penalty until some earlier line has contained delim
+                               linesNotEqual++
+                       }
+                       countLineMax = util.Max(countLine, countLineMax)
+               }
+       }
+
+       return float64(countTotal) * (1 - float64(linesNotEqual)/float64(len(lines))) // total count weighted by the share of lines that were not flagged as irregular
+}
index f050296cee1bb0fb009cdd27bc496b0695ca717f..4d4e0871e94d4b0d071caddd6308bfc6ec684334 100644 (file)
@@ -13,9 +13,14 @@ import (
 func TestRenderCSV(t *testing.T) {
        var parser Parser
        var kases = map[string]string{
-               "a":     "<table class=\"table\"><tr><td>a</td><tr></table>",
-               "1,2":   "<table class=\"table\"><tr><td>1</td><td>2</td><tr></table>",
-               "<br/>": "<table class=\"table\"><tr><td>&lt;br/&gt;</td><tr></table>",
+               "a":                         "<table class=\"table\"><tr><td>a</td></tr></table>",
+               "1,2":                       "<table class=\"table\"><tr><td>1</td><td>2</td></tr></table>",
+               "1;2":                       "<table class=\"table\"><tr><td>1</td><td>2</td></tr></table>",
+               "1\t2":                      "<table class=\"table\"><tr><td>1</td><td>2</td></tr></table>",
+               "1|2":                       "<table class=\"table\"><tr><td>1</td><td>2</td></tr></table>",
+               "1,2,3;4,5,6;7,8,9\na;b;c":  "<table class=\"table\"><tr><td>1,2,3</td><td>4,5,6</td><td>7,8,9</td></tr><tr><td>a</td><td>b</td><td>c</td></tr></table>",
+               "\"1,2,3,4\";\"a\nb\"\nc;d": "<table class=\"table\"><tr><td>1,2,3,4</td><td>a\nb</td></tr><tr><td>c</td><td>d</td></tr></table>",
+               "<br/>":                     "<table class=\"table\"><tr><td>&lt;br/&gt;</td></tr></table>",
        }
 
        for k, v := range kases {
index b19b85ad36af5c23959ffeb31a44647fdd4df93d..6b906cc4a8137ac4d1c04d14a5ea6f4612fe87c8 100644 (file)
@@ -489,6 +489,7 @@ footer .ui.left,footer .ui.right{line-height:40px}
 .repository.file.list .non-diff-file-content .view-raw img{padding:5px 5px 0 5px}
 .repository.file.list .non-diff-file-content .plain-text{padding:1em 2em 1em 2em}
 .repository.file.list .non-diff-file-content .plain-text pre{word-break:break-word;white-space:pre-wrap}
+.repository.file.list .non-diff-file-content .csv{overflow-x:auto}
 .repository.file.list .non-diff-file-content pre{overflow:auto}
 .repository.file.list .sidebar{padding-left:0}
 .repository.file.list .sidebar .octicon{width:16px}
index eb183c1626d285fc129768482263d99299b053f0..ef05beb6fa3999c3511b7587cf4a7f18afe75eaf 100644 (file)
                 }
             }
 
+            .csv {
+                overflow-x: auto;
+            }
+
             pre {
                 overflow: auto;
             }
index b534ae0e164055be34e08a62542d6832e68ad45a..9d24506b47ce99291e532f2dc990e8b69d379a3c 100644 (file)
@@ -162,8 +162,9 @@ func renderDirectory(ctx *context.Context, treeLink string) {
                                d, _ := ioutil.ReadAll(dataRc)
                                buf = charset.ToUTF8WithFallback(append(buf, d...))
 
-                               if markup.Type(readmeFile.Name()) != "" {
+                               if markupType := markup.Type(readmeFile.Name()); markupType != "" {
                                        ctx.Data["IsMarkup"] = true
+                                       ctx.Data["MarkupType"] = string(markupType)
                                        ctx.Data["FileContent"] = string(markup.Render(readmeFile.Name(), buf, treeLink, ctx.Repo.Repository.ComposeMetas()))
                                } else {
                                        ctx.Data["IsRenderedHTML"] = true
@@ -282,8 +283,9 @@ func renderFile(ctx *context.Context, entry *git.TreeEntry, treeLink, rawLink st
 
                readmeExist := markup.IsReadmeFile(blob.Name())
                ctx.Data["ReadmeExist"] = readmeExist
-               if markup.Type(blob.Name()) != "" {
+               if markupType := markup.Type(blob.Name()); markupType != "" {
                        ctx.Data["IsMarkup"] = true
+                       ctx.Data["MarkupType"] = markupType
                        ctx.Data["FileContent"] = string(markup.Render(blob.Name(), buf, path.Dir(treeLink), ctx.Repo.Repository.ComposeMetas()))
                } else if readmeExist {
                        ctx.Data["IsRenderedHTML"] = true
index 72b1ae7a84e50adafed3240b7c7cdf9337564bfe..895a72aaee5b099a05109a2b72f8e287589c5b9a 100644 (file)
@@ -45,7 +45,7 @@
                </div>
        </h4>
        <div class="ui attached table unstackable segment">
-               <div class="file-view {{if .IsMarkup}}markdown{{else if .IsRenderedHTML}}plain-text{{else if .IsTextFile}}code-view{{end}} has-emoji">
+               <div class="file-view {{if .IsMarkup}}{{.MarkupType}}{{else if .IsRenderedHTML}}plain-text{{else if .IsTextFile}}code-view{{end}} has-emoji">
                        {{if .IsMarkup}}
                                {{if .FileContent}}{{.FileContent | Safe}}{{end}}
                        {{else if .IsRenderedHTML}}