source.dussan.org Git - gitea.git/commitdiff
Detect encoding changes while parsing diff (#16330)
author: Jimmy Praet <jimmy.praet@telenet.be>
Tue, 13 Jul 2021 01:13:52 +0000 (03:13 +0200)
committer: GitHub <noreply@github.com>
Tue, 13 Jul 2021 01:13:52 +0000 (03:13 +0200)
* Detect encoding changes while parsing diff

services/gitdiff/gitdiff.go

index f8f0fd7e3b90b26553f5adbaf965c6dfa438b020..d50e41eb40279836c4469cc4f093a8c59d725585 100644 (file)
@@ -32,6 +32,7 @@ import (
 
        "github.com/sergi/go-diff/diffmatchpatch"
        stdcharset "golang.org/x/net/html/charset"
+       "golang.org/x/text/encoding"
        "golang.org/x/text/transform"
 )
 
@@ -883,35 +884,46 @@ parsingLoop:
 
        }
 
-       // FIXME: There are numerous issues with this:
+       // TODO: There are numerous issues with this:
        // - we might want to consider detecting encoding while parsing but...
        // - we're likely to fail to get the correct encoding here anyway as we won't have enough information
-       // - and this doesn't really account for changes in encoding
-       var buf bytes.Buffer
+       var diffLineTypeBuffers = make(map[DiffLineType]*bytes.Buffer, 3)
+       var diffLineTypeDecoders = make(map[DiffLineType]*encoding.Decoder, 3)
+       diffLineTypeBuffers[DiffLinePlain] = new(bytes.Buffer)
+       diffLineTypeBuffers[DiffLineAdd] = new(bytes.Buffer)
+       diffLineTypeBuffers[DiffLineDel] = new(bytes.Buffer)
        for _, f := range diff.Files {
-               buf.Reset()
+               for _, buffer := range diffLineTypeBuffers {
+                       buffer.Reset()
+               }
                for _, sec := range f.Sections {
                        for _, l := range sec.Lines {
                                if l.Type == DiffLineSection {
                                        continue
                                }
-                               buf.WriteString(l.Content[1:])
-                               buf.WriteString("\n")
+                               diffLineTypeBuffers[l.Type].WriteString(l.Content[1:])
+                               diffLineTypeBuffers[l.Type].WriteString("\n")
                        }
                }
-               charsetLabel, err := charset.DetectEncoding(buf.Bytes())
-               if charsetLabel != "UTF-8" && err == nil {
-                       encoding, _ := stdcharset.Lookup(charsetLabel)
-                       if encoding != nil {
-                               d := encoding.NewDecoder()
-                               for _, sec := range f.Sections {
-                                       for _, l := range sec.Lines {
-                                               if l.Type == DiffLineSection {
-                                                       continue
-                                               }
-                                               if c, _, err := transform.String(d, l.Content[1:]); err == nil {
-                                                       l.Content = l.Content[0:1] + c
-                                               }
+               for lineType, buffer := range diffLineTypeBuffers {
+                       diffLineTypeDecoders[lineType] = nil
+                       if buffer.Len() == 0 {
+                               continue
+                       }
+                       charsetLabel, err := charset.DetectEncoding(buffer.Bytes())
+                       if charsetLabel != "UTF-8" && err == nil {
+                               encoding, _ := stdcharset.Lookup(charsetLabel)
+                               if encoding != nil {
+                                       diffLineTypeDecoders[lineType] = encoding.NewDecoder()
+                               }
+                       }
+               }
+               for _, sec := range f.Sections {
+                       for _, l := range sec.Lines {
+                               decoder := diffLineTypeDecoders[l.Type]
+                               if decoder != nil {
+                                       if c, _, err := transform.String(decoder, l.Content[1:]); err == nil {
+                                               l.Content = l.Content[0:1] + c
                                        }
                                }
                        }