]> source.dussan.org Git - gitea.git/commitdiff
Fix diff skipping lines (#13155)
authorzeripath <art27@cantab.net>
Sat, 17 Oct 2020 01:39:35 +0000 (02:39 +0100)
committerGitHub <noreply@github.com>
Sat, 17 Oct 2020 01:39:35 +0000 (21:39 -0400)
* Fix diff skipping lines

Backport #13154

ParsePatch previously just skipped all lines that start with "+++ " or "--- "
and makes no attempt to see these lines in context.

This PR rewrites ParsePatch to pay attention to context and position
within a patch, ensuring that --- and +++ are only skipped if
appropriate.

This PR also fixes several issues with incomplete files.

Fix https://codeberg.org/Codeberg/Community/issues/308
Fix #13153

Signed-off-by: Andrew Thornton <art27@cantab.net>
* Add testcase

Signed-off-by: Andrew Thornton <art27@cantab.net>
* fix comment

* simplify error handling

Signed-off-by: Andrew Thornton <art27@cantab.net>
* never return io.EOF

Signed-off-by: Andrew Thornton <art27@cantab.net>
Co-authored-by: techknowlogick <techknowlogick@gitea.io>
services/gitdiff/gitdiff.go
services/gitdiff/gitdiff_test.go

index 1882b48ed4c0c66a625019def86ea38809686cea..91105399db8b705ca17b3341ac0f05dd278ef890 100644 (file)
@@ -441,91 +441,252 @@ func (diff *Diff) LoadComments(issue *models.Issue, currentUser *models.User) er
 
 const cmdDiffHead = "diff --git "
 
-// ParsePatch builds a Diff object from a io.Reader and some
-// parameters.
-// TODO: move this function to gogits/git-module
+// ParsePatch builds a Diff object from a io.Reader and some parameters.
 func ParsePatch(maxLines, maxLineCharacters, maxFiles int, reader io.Reader) (*Diff, error) {
-       var (
-               diff       = &Diff{Files: make([]*DiffFile, 0)}
-               curFile    = &DiffFile{}
-               curSection = &DiffSection{
-                       Lines: make([]*DiffLine, 0, 10),
+       var curFile *DiffFile
+
+       diff := &Diff{Files: make([]*DiffFile, 0)}
+
+       sb := strings.Builder{}
+
+       // OK let's set a reasonable buffer size.
+       // This should be let's say at least the size of maxLineCharacters or 4096 whichever is larger.
+       readerSize := maxLineCharacters
+       if readerSize < 4096 {
+               readerSize = 4096
+       }
+
+       input := bufio.NewReaderSize(reader, readerSize)
+       line, err := input.ReadString('\n')
+       if err != nil {
+               if err == io.EOF {
+                       return diff, nil
+               }
+               return diff, err
+       }
+parsingLoop:
+       for {
+               // 1. A patch file always begins with `diff --git ` + `a/path b/path` (possibly quoted)
+               // if it does not we have bad input!
+               if !strings.HasPrefix(line, cmdDiffHead) {
+                       return diff, fmt.Errorf("Invalid first file line: %s", line)
                }
 
-               leftLine, rightLine int
-               lineCount           int
-               curFileLinesCount   int
-               curFileLFSPrefix    bool
-       )
+               // TODO: Handle skipping first n files
+               if len(diff.Files) >= maxFiles {
+                       diff.IsIncomplete = true
+                       _, err := io.Copy(ioutil.Discard, reader)
+                       if err != nil {
+                               // By the definition of io.Copy this never returns io.EOF
+                               return diff, fmt.Errorf("Copy: %v", err)
+                       }
+                       break parsingLoop
+               }
 
-       input := bufio.NewReader(reader)
-       isEOF := false
-       for !isEOF {
-               var linebuf bytes.Buffer
+               curFile = createDiffFile(diff, line)
+               diff.Files = append(diff.Files, curFile)
+
+               // 2. It is followed by one or more extended header lines:
+               //
+               //     old mode <mode>
+               //     new mode <mode>
+               //     deleted file mode <mode>
+               //     new file mode <mode>
+               //     copy from <path>
+               //     copy to <path>
+               //     rename from <path>
+               //     rename to <path>
+               //     similarity index <number>
+               //     dissimilarity index <number>
+               //     index <hash>..<hash> <mode>
+               //
+               // * <mode> 6-digit octal numbers including the file type and file permission bits.
+               // * <path> does not include the a/ and b/ prefixes
+               // * <number> percentage of unchanged lines for similarity, percentage of changed
+               //   lines dissimilarity as integer rounded down with terminal %. 100% => equal files.
+               // * The index line includes the blob object names before and after the change.
+               //   The <mode> is included if the file mode does not change; otherwise, separate
+               //   lines indicate the old and the new mode.
+               // 3. Following this header the "standard unified" diff format header may be encountered: (but not for every case...)
+               //
+               //     --- a/<path>
+               //     +++ b/<path>
+               //
+               // With multiple hunks
+               //
+               //     @@ <hunk descriptor> @@
+               //     +added line
+               //     -removed line
+               //      unchanged line
+               //
+               // 4. Binary files get:
+               //
+               //     Binary files a/<path> and b/<path> differ
+               //
+               // but one of a/<path> and b/<path> could be /dev/null.
+       curFileLoop:
                for {
-                       b, err := input.ReadByte()
+                       line, err = input.ReadString('\n')
                        if err != nil {
-                               if err == io.EOF {
-                                       isEOF = true
-                                       break
-                               } else {
-                                       return nil, fmt.Errorf("ReadByte: %v", err)
+                               if err != io.EOF {
+                                       return diff, err
                                }
+                               break parsingLoop
                        }
-                       if b == '\n' {
-                               break
-                       }
-                       if linebuf.Len() < maxLineCharacters {
-                               linebuf.WriteByte(b)
-                       } else if linebuf.Len() == maxLineCharacters {
-                               curFile.IsIncomplete = true
+                       switch {
+                       case strings.HasPrefix(line, "old mode ") ||
+                               strings.HasPrefix(line, "new mode "):
+                               if strings.HasSuffix(line, " 160000\n") {
+                                       curFile.IsSubmodule = true
+                               }
+                       case strings.HasPrefix(line, "copy from "):
+                               curFile.IsRenamed = true
+                               curFile.Type = DiffFileCopy
+                       case strings.HasPrefix(line, "copy to "):
+                               curFile.IsRenamed = true
+                               curFile.Type = DiffFileCopy
+                       case strings.HasPrefix(line, "new file"):
+                               curFile.Type = DiffFileAdd
+                               curFile.IsCreated = true
+                               if strings.HasSuffix(line, " 160000\n") {
+                                       curFile.IsSubmodule = true
+                               }
+                       case strings.HasPrefix(line, "deleted"):
+                               curFile.Type = DiffFileDel
+                               curFile.IsDeleted = true
+                               if strings.HasSuffix(line, " 160000\n") {
+                                       curFile.IsSubmodule = true
+                               }
+                       case strings.HasPrefix(line, "index"):
+                               if strings.HasSuffix(line, " 160000\n") {
+                                       curFile.IsSubmodule = true
+                               }
+                       case strings.HasPrefix(line, "similarity index 100%"):
+                               curFile.Type = DiffFileRename
+                       case strings.HasPrefix(line, "Binary"):
+                               curFile.IsBin = true
+                       case strings.HasPrefix(line, "--- "):
+                               // Do nothing with this line
+                       case strings.HasPrefix(line, "+++ "):
+                               // Do nothing with this line
+                               lineBytes, isFragment, err := parseHunks(curFile, maxLines, maxLineCharacters, input)
+                               diff.TotalAddition += curFile.Addition
+                               diff.TotalDeletion += curFile.Deletion
+                               if err != nil {
+                                       if err != io.EOF {
+                                               return diff, err
+                                       }
+                                       break parsingLoop
+                               }
+                               sb.Reset()
+                               _, _ = sb.Write(lineBytes)
+                               for isFragment {
+                                       lineBytes, isFragment, err = input.ReadLine()
+                                       if err != nil {
+                                               // Now by the definition of ReadLine this cannot be io.EOF
+                                               return diff, fmt.Errorf("Unable to ReadLine: %v", err)
+                                       }
+                                       _, _ = sb.Write(lineBytes)
+                               }
+                               line = sb.String()
+                               sb.Reset()
+
+                               break curFileLoop
                        }
                }
-               line := linebuf.String()
 
-               if strings.HasPrefix(line, "+++ ") || strings.HasPrefix(line, "--- ") || len(line) == 0 {
-                       continue
+       }
+
+       // FIXME: There are numerous issues with this:
+       // - we might want to consider detecting encoding while parsing but...
+       // - we're likely to fail to get the correct encoding here anyway as we won't have enough information
+       // - and this doesn't really account for changes in encoding
+       var buf bytes.Buffer
+       for _, f := range diff.Files {
+               buf.Reset()
+               for _, sec := range f.Sections {
+                       for _, l := range sec.Lines {
+                               if l.Type == DiffLineSection {
+                                       continue
+                               }
+                               buf.WriteString(l.Content[1:])
+                               buf.WriteString("\n")
+                       }
                }
+               charsetLabel, err := charset.DetectEncoding(buf.Bytes())
+               if charsetLabel != "UTF-8" && err == nil {
+                       encoding, _ := stdcharset.Lookup(charsetLabel)
+                       if encoding != nil {
+                               d := encoding.NewDecoder()
+                               for _, sec := range f.Sections {
+                                       for _, l := range sec.Lines {
+                                               if l.Type == DiffLineSection {
+                                                       continue
+                                               }
+                                               if c, _, err := transform.String(d, l.Content[1:]); err == nil {
+                                                       l.Content = l.Content[0:1] + c
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
 
-               trimLine := strings.Trim(line, "+- ")
+       diff.NumFiles = len(diff.Files)
+       return diff, nil
+}
 
-               if trimLine == models.LFSMetaFileIdentifier {
-                       curFileLFSPrefix = true
-               }
+func parseHunks(curFile *DiffFile, maxLines, maxLineCharacters int, input *bufio.Reader) (lineBytes []byte, isFragment bool, err error) {
+       sb := strings.Builder{}
 
-               if curFileLFSPrefix && strings.HasPrefix(trimLine, models.LFSMetaFileOidPrefix) {
-                       oid := strings.TrimPrefix(trimLine, models.LFSMetaFileOidPrefix)
+       var (
+               curSection        *DiffSection
+               curFileLinesCount int
+               curFileLFSPrefix  bool
+       )
 
-                       if len(oid) == 64 {
-                               m := &models.LFSMetaObject{Oid: oid}
-                               count, err := models.Count(m)
+       leftLine, rightLine := 1, 1
 
-                               if err == nil && count > 0 {
-                                       curFile.IsBin = true
-                                       curFile.IsLFSFile = true
-                                       curSection.Lines = nil
-                               }
+       for {
+               sb.Reset()
+               lineBytes, isFragment, err = input.ReadLine()
+               if err != nil {
+                       if err == io.EOF {
+                               return
                        }
+                       err = fmt.Errorf("Unable to ReadLine: %v", err)
+                       return
+               }
+               if lineBytes[0] == 'd' {
+                       // End of hunks
+                       return
                }
 
-               curFileLinesCount++
-               lineCount++
+               switch lineBytes[0] {
+               case '@':
+                       if curFileLinesCount >= maxLines {
+                               curFile.IsIncomplete = true
+                               continue
+                       }
 
-               // Diff data too large, we only show the first about maxLines lines
-               if curFileLinesCount >= maxLines {
-                       curFile.IsIncomplete = true
-               }
-               switch {
-               case line[0] == ' ':
-                       diffLine := &DiffLine{Type: DiffLinePlain, Content: line, LeftIdx: leftLine, RightIdx: rightLine}
-                       leftLine++
-                       rightLine++
-                       curSection.Lines = append(curSection.Lines, diffLine)
-                       curSection.FileName = curFile.Name
-                       continue
-               case line[0] == '@':
+                       _, _ = sb.Write(lineBytes)
+                       for isFragment {
+                               // This is very odd indeed - we're in a section header and the line is too long
+                               // This really shouldn't happen...
+                               lineBytes, isFragment, err = input.ReadLine()
+                               if err != nil {
+                                       // Now by the definition of ReadLine this cannot be io.EOF
+                                       err = fmt.Errorf("Unable to ReadLine: %v", err)
+                                       return
+                               }
+                               _, _ = sb.Write(lineBytes)
+                       }
+                       line := sb.String()
+
+                       // Create a new section to represent this hunk
                        curSection = &DiffSection{}
                        curFile.Sections = append(curFile.Sections, curSection)
+
                        lineSectionInfo := getDiffLineSectionInfo(curFile.Name, line, leftLine-1, rightLine-1)
                        diffLine := &DiffLine{
                                Type:        DiffLineSection,
@@ -538,148 +699,132 @@ func ParsePatch(maxLines, maxLineCharacters, maxFiles int, reader io.Reader) (*D
                        leftLine = lineSectionInfo.LeftIdx
                        rightLine = lineSectionInfo.RightIdx
                        continue
-               case line[0] == '+':
+               case '\\':
+                       if curFileLinesCount >= maxLines {
+                               curFile.IsIncomplete = true
+                               continue
+                       }
+                       // This is used only to indicate that the current file does not have a terminal newline
+                       if !bytes.Equal(lineBytes, []byte("\\ No newline at end of file")) {
+                               err = fmt.Errorf("Unexpected line in hunk: %s", string(lineBytes))
+                               return
+                       }
+                       // Technically this should be the end the file!
+                       // FIXME: we should be putting a marker at the end of the file if there is no terminal new line
+                       continue
+               case '+':
+                       curFileLinesCount++
                        curFile.Addition++
-                       diff.TotalAddition++
-                       diffLine := &DiffLine{Type: DiffLineAdd, Content: line, RightIdx: rightLine}
+                       if curFileLinesCount >= maxLines {
+                               curFile.IsIncomplete = true
+                               continue
+                       }
+                       diffLine := &DiffLine{Type: DiffLineAdd, RightIdx: rightLine}
                        rightLine++
                        curSection.Lines = append(curSection.Lines, diffLine)
-                       curSection.FileName = curFile.Name
-                       continue
-               case line[0] == '-':
+               case '-':
+                       curFileLinesCount++
                        curFile.Deletion++
-                       diff.TotalDeletion++
-                       diffLine := &DiffLine{Type: DiffLineDel, Content: line, LeftIdx: leftLine}
+                       if curFileLinesCount >= maxLines {
+                               curFile.IsIncomplete = true
+                               continue
+                       }
+                       diffLine := &DiffLine{Type: DiffLineDel, LeftIdx: leftLine}
                        if leftLine > 0 {
                                leftLine++
                        }
                        curSection.Lines = append(curSection.Lines, diffLine)
-                       curSection.FileName = curFile.Name
-               case strings.HasPrefix(line, "Binary"):
-                       curFile.IsBin = true
-                       continue
+               case ' ':
+                       curFileLinesCount++
+                       if curFileLinesCount >= maxLines {
+                               curFile.IsIncomplete = true
+                               continue
+                       }
+                       diffLine := &DiffLine{Type: DiffLinePlain, LeftIdx: leftLine, RightIdx: rightLine}
+                       leftLine++
+                       rightLine++
+                       curSection.Lines = append(curSection.Lines, diffLine)
+               default:
+                       // This is unexpected
+                       err = fmt.Errorf("Unexpected line in hunk: %s", string(lineBytes))
+                       return
                }
 
-               // Get new file.
-               if strings.HasPrefix(line, cmdDiffHead) {
-                       if len(diff.Files) >= maxFiles {
-                               diff.IsIncomplete = true
-                               _, err := io.Copy(ioutil.Discard, reader)
+               line := string(lineBytes)
+               if isFragment {
+                       curFile.IsIncomplete = true
+                       for isFragment {
+                               lineBytes, isFragment, err = input.ReadLine()
                                if err != nil {
-                                       return nil, fmt.Errorf("Copy: %v", err)
+                                       // Now by the definition of ReadLine this cannot be io.EOF
+                                       err = fmt.Errorf("Unable to ReadLine: %v", err)
+                                       return
                                }
-                               break
                        }
+               }
+               curSection.Lines[len(curSection.Lines)-1].Content = line
 
-                       // Note: In case file name is surrounded by double quotes (it happens only in git-shell).
-                       // e.g. diff --git "a/xxx" "b/xxx"
-                       var a string
-                       var b string
-
-                       rd := strings.NewReader(line[len(cmdDiffHead):])
-                       char, _ := rd.ReadByte()
-                       _ = rd.UnreadByte()
-                       if char == '"' {
-                               fmt.Fscanf(rd, "%q ", &a)
-                               if a[0] == '\\' {
-                                       a = a[1:]
-                               }
-                       } else {
-                               fmt.Fscanf(rd, "%s ", &a)
-                       }
-                       char, _ = rd.ReadByte()
-                       _ = rd.UnreadByte()
-                       if char == '"' {
-                               fmt.Fscanf(rd, "%q", &b)
-                               if b[0] == '\\' {
-                                       b = b[1:]
-                               }
-                       } else {
-                               fmt.Fscanf(rd, "%s", &b)
-                       }
-                       a = a[2:]
-                       b = b[2:]
-
-                       curFile = &DiffFile{
-                               Name:      b,
-                               OldName:   a,
-                               Index:     len(diff.Files) + 1,
-                               Type:      DiffFileChange,
-                               Sections:  make([]*DiffSection, 0, 10),
-                               IsRenamed: a != b,
-                       }
-                       diff.Files = append(diff.Files, curFile)
-                       curFileLinesCount = 0
-                       leftLine = 1
-                       rightLine = 1
-                       curFileLFSPrefix = false
-
-                       // Check file diff type and is submodule.
-                       for {
-                               line, err := input.ReadString('\n')
-                               if err != nil {
-                                       if err == io.EOF {
-                                               isEOF = true
-                                       } else {
-                                               return nil, fmt.Errorf("ReadString: %v", err)
-                                       }
-                               }
+               // handle LFS
+               if line[1:] == models.LFSMetaFileIdentifier {
+                       curFileLFSPrefix = true
+               } else if curFileLFSPrefix && strings.HasPrefix(line[1:], models.LFSMetaFileOidPrefix) {
+                       oid := strings.TrimPrefix(line[1:], models.LFSMetaFileOidPrefix)
+                       if len(oid) == 64 {
+                               m := &models.LFSMetaObject{Oid: oid}
+                               count, err := models.Count(m)
 
-                               switch {
-                               case strings.HasPrefix(line, "copy from "):
-                                       curFile.IsRenamed = true
-                                       curFile.Type = DiffFileCopy
-                               case strings.HasPrefix(line, "copy to "):
-                                       curFile.IsRenamed = true
-                                       curFile.Type = DiffFileCopy
-                               case strings.HasPrefix(line, "new file"):
-                                       curFile.Type = DiffFileAdd
-                                       curFile.IsCreated = true
-                               case strings.HasPrefix(line, "deleted"):
-                                       curFile.Type = DiffFileDel
-                                       curFile.IsDeleted = true
-                               case strings.HasPrefix(line, "index"):
-                                       curFile.Type = DiffFileChange
-                               case strings.HasPrefix(line, "similarity index 100%"):
-                                       curFile.Type = DiffFileRename
-                               }
-                               if curFile.Type > 0 {
-                                       if strings.HasSuffix(line, " 160000\n") {
-                                               curFile.IsSubmodule = true
-                                       }
-                                       break
+                               if err == nil && count > 0 {
+                                       curFile.IsBin = true
+                                       curFile.IsLFSFile = true
+                                       curSection.Lines = nil
                                }
                        }
                }
        }
+}
 
-       // FIXME: detect encoding while parsing.
-       var buf bytes.Buffer
-       for _, f := range diff.Files {
-               buf.Reset()
-               for _, sec := range f.Sections {
-                       for _, l := range sec.Lines {
-                               buf.WriteString(l.Content)
-                               buf.WriteString("\n")
-                       }
-               }
-               charsetLabel, err := charset.DetectEncoding(buf.Bytes())
-               if charsetLabel != "UTF-8" && err == nil {
-                       encoding, _ := stdcharset.Lookup(charsetLabel)
-                       if encoding != nil {
-                               d := encoding.NewDecoder()
-                               for _, sec := range f.Sections {
-                                       for _, l := range sec.Lines {
-                                               if c, _, err := transform.String(d, l.Content); err == nil {
-                                                       l.Content = c
-                                               }
-                                       }
-                               }
-                       }
+func createDiffFile(diff *Diff, line string) *DiffFile {
+       // The a/ and b/ filenames are the same unless rename/copy is involved.
+       // Especially, even for a creation or a deletion, /dev/null is not used
+       // in place of the a/ or b/ filenames.
+       //
+       // When rename/copy is involved, file1 and file2 show the name of the
+       // source file of the rename/copy and the name of the file that rename/copy
+       // produces, respectively.
+       //
+       // Path names are quoted if necessary.
+       //
+       // This means that you should always be able to determine the file name even when there
+       // there is potential ambiguity...
+       //
+       // but we can be simpler with our heuristics by just forcing git to prefix things nicely
+       curFile := &DiffFile{
+               Index:    len(diff.Files) + 1,
+               Type:     DiffFileChange,
+               Sections: make([]*DiffSection, 0, 10),
+       }
+
+       rd := strings.NewReader(line[len(cmdDiffHead):] + " ")
+       curFile.Type = DiffFileChange
+       curFile.OldName = readFileName(rd)
+       curFile.Name = readFileName(rd)
+       curFile.IsRenamed = curFile.Name != curFile.OldName
+       return curFile
+}
+
+func readFileName(rd *strings.Reader) string {
+       var name string
+       char, _ := rd.ReadByte()
+       _ = rd.UnreadByte()
+       if char == '"' {
+               fmt.Fscanf(rd, "%q ", &name)
+               if name[0] == '\\' {
+                       name = name[1:]
                }
+       } else {
+               fmt.Fscanf(rd, "%s ", &name)
        }
-       diff.NumFiles = len(diff.Files)
-       return diff, nil
+       return name[2:]
 }
 
 // GetDiffRange builds a Diff between two commits of a repository.
index b90ad657f618ed8a02d16c63084261178951f8a9..64cd4f1c21f38134745fc23efb219891e3212690 100644 (file)
@@ -182,6 +182,27 @@ rename to a b/a a/file b/b file
                        oldFilename: "a b/file b/a a/file",
                        filename:    "a b/a a/file b/b file",
                },
+               {
+                       name: "minuses-and-pluses",
+                       gitdiff: `diff --git a/minuses-and-pluses b/minuses-and-pluses
+index 6961180..9ba1a00 100644
+--- a/minuses-and-pluses
++++ b/minuses-and-pluses
+@@ -1,4 +1,4 @@
+--- 1st line
+-++ 2nd line
+--- 3rd line
+-++ 4th line
++++ 1st line
++-- 2nd line
++++ 3rd line
++-- 4th line
+`,
+                       oldFilename: "minuses-and-pluses",
+                       filename:    "minuses-and-pluses",
+                       addition:    4,
+                       deletion:    4,
+               },
        }
 
        for _, testcase := range tests {