summaryrefslogtreecommitdiffstats
path: root/services/gitdiff/gitdiff.go
diff options
context:
space:
mode:
authorzeripath <art27@cantab.net>2020-10-16 18:13:18 +0100
committerGitHub <noreply@github.com>2020-10-16 13:13:18 -0400
commit6bab678bed726e9d5371cd10050ea217ab076c24 (patch)
tree28f99213856802be65a965f91441742b0e20ae4a /services/gitdiff/gitdiff.go
parentb222dbc1d1df2559206ec1c27928d4666a48f0e8 (diff)
downloadgitea-6bab678bed726e9d5371cd10050ea217ab076c24.tar.gz
gitea-6bab678bed726e9d5371cd10050ea217ab076c24.zip
Fix diff skipping lines (#13154)
* Fix diff skipping lines ParsePatch previously just skipped all lines that start with "+++ " or "--- " and makes no attempt to see these lines in context. This PR rewrites ParsePatch to pay attention to context and position within a patch, ensuring that --- and +++ are only skipped if appropriate. This PR also fixes several issues with incomplete files. Fix https://codeberg.org/Codeberg/Community/issues/308 Fix #13153 Signed-off-by: Andrew Thornton <art27@cantab.net> * Add testcase Signed-off-by: Andrew Thornton <art27@cantab.net> * fix comment * simplify error handling Signed-off-by: Andrew Thornton <art27@cantab.net> * never return io.EOF Signed-off-by: Andrew Thornton <art27@cantab.net> Co-authored-by: Lauris BH <lauris@nix.lv>
Diffstat (limited to 'services/gitdiff/gitdiff.go')
-rw-r--r--services/gitdiff/gitdiff.go509
1 files changed, 327 insertions, 182 deletions
diff --git a/services/gitdiff/gitdiff.go b/services/gitdiff/gitdiff.go
index 1882b48ed4..91105399db 100644
--- a/services/gitdiff/gitdiff.go
+++ b/services/gitdiff/gitdiff.go
@@ -441,91 +441,252 @@ func (diff *Diff) LoadComments(issue *models.Issue, currentUser *models.User) er
const cmdDiffHead = "diff --git "
-// ParsePatch builds a Diff object from a io.Reader and some
-// parameters.
-// TODO: move this function to gogits/git-module
+// ParsePatch builds a Diff object from a io.Reader and some parameters.
func ParsePatch(maxLines, maxLineCharacters, maxFiles int, reader io.Reader) (*Diff, error) {
- var (
- diff = &Diff{Files: make([]*DiffFile, 0)}
- curFile = &DiffFile{}
- curSection = &DiffSection{
- Lines: make([]*DiffLine, 0, 10),
+ var curFile *DiffFile
+
+ diff := &Diff{Files: make([]*DiffFile, 0)}
+
+ sb := strings.Builder{}
+
+ // OK let's set a reasonable buffer size.
+ // This should be let's say at least the size of maxLineCharacters or 4096 whichever is larger.
+ readerSize := maxLineCharacters
+ if readerSize < 4096 {
+ readerSize = 4096
+ }
+
+ input := bufio.NewReaderSize(reader, readerSize)
+ line, err := input.ReadString('\n')
+ if err != nil {
+ if err == io.EOF {
+ return diff, nil
+ }
+ return diff, err
+ }
+parsingLoop:
+ for {
+ // 1. A patch file always begins with `diff --git ` + `a/path b/path` (possibly quoted)
+ // if it does not we have bad input!
+ if !strings.HasPrefix(line, cmdDiffHead) {
+ return diff, fmt.Errorf("Invalid first file line: %s", line)
}
- leftLine, rightLine int
- lineCount int
- curFileLinesCount int
- curFileLFSPrefix bool
- )
+ // TODO: Handle skipping first n files
+ if len(diff.Files) >= maxFiles {
+ diff.IsIncomplete = true
+ _, err := io.Copy(ioutil.Discard, reader)
+ if err != nil {
+ // By the definition of io.Copy this never returns io.EOF
+ return diff, fmt.Errorf("Copy: %v", err)
+ }
+ break parsingLoop
+ }
- input := bufio.NewReader(reader)
- isEOF := false
- for !isEOF {
- var linebuf bytes.Buffer
+ curFile = createDiffFile(diff, line)
+ diff.Files = append(diff.Files, curFile)
+
+ // 2. It is followed by one or more extended header lines:
+ //
+ // old mode <mode>
+ // new mode <mode>
+ // deleted file mode <mode>
+ // new file mode <mode>
+ // copy from <path>
+ // copy to <path>
+ // rename from <path>
+ // rename to <path>
+ // similarity index <number>
+ // dissimilarity index <number>
+ // index <hash>..<hash> <mode>
+ //
+ // * <mode> 6-digit octal numbers including the file type and file permission bits.
+ // * <path> does not include the a/ and b/ prefixes
+ // * <number> percentage of unchanged lines for similarity, percentage of changed
+ // lines dissimilarity as integer rounded down with terminal %. 100% => equal files.
+ // * The index line includes the blob object names before and after the change.
+ // The <mode> is included if the file mode does not change; otherwise, separate
+ // lines indicate the old and the new mode.
+ // 3. Following this header the "standard unified" diff format header may be encountered: (but not for every case...)
+ //
+ // --- a/<path>
+ // +++ b/<path>
+ //
+ // With multiple hunks
+ //
+ // @@ <hunk descriptor> @@
+ // +added line
+ // -removed line
+ // unchanged line
+ //
+ // 4. Binary files get:
+ //
+ // Binary files a/<path> and b/<path> differ
+ //
+ // but one of a/<path> and b/<path> could be /dev/null.
+ curFileLoop:
for {
- b, err := input.ReadByte()
+ line, err = input.ReadString('\n')
if err != nil {
- if err == io.EOF {
- isEOF = true
- break
- } else {
- return nil, fmt.Errorf("ReadByte: %v", err)
+ if err != io.EOF {
+ return diff, err
}
+ break parsingLoop
}
- if b == '\n' {
- break
- }
- if linebuf.Len() < maxLineCharacters {
- linebuf.WriteByte(b)
- } else if linebuf.Len() == maxLineCharacters {
- curFile.IsIncomplete = true
+ switch {
+ case strings.HasPrefix(line, "old mode ") ||
+ strings.HasPrefix(line, "new mode "):
+ if strings.HasSuffix(line, " 160000\n") {
+ curFile.IsSubmodule = true
+ }
+ case strings.HasPrefix(line, "copy from "):
+ curFile.IsRenamed = true
+ curFile.Type = DiffFileCopy
+ case strings.HasPrefix(line, "copy to "):
+ curFile.IsRenamed = true
+ curFile.Type = DiffFileCopy
+ case strings.HasPrefix(line, "new file"):
+ curFile.Type = DiffFileAdd
+ curFile.IsCreated = true
+ if strings.HasSuffix(line, " 160000\n") {
+ curFile.IsSubmodule = true
+ }
+ case strings.HasPrefix(line, "deleted"):
+ curFile.Type = DiffFileDel
+ curFile.IsDeleted = true
+ if strings.HasSuffix(line, " 160000\n") {
+ curFile.IsSubmodule = true
+ }
+ case strings.HasPrefix(line, "index"):
+ if strings.HasSuffix(line, " 160000\n") {
+ curFile.IsSubmodule = true
+ }
+ case strings.HasPrefix(line, "similarity index 100%"):
+ curFile.Type = DiffFileRename
+ case strings.HasPrefix(line, "Binary"):
+ curFile.IsBin = true
+ case strings.HasPrefix(line, "--- "):
+ // Do nothing with this line
+ case strings.HasPrefix(line, "+++ "):
+ // Do nothing with this line
+ lineBytes, isFragment, err := parseHunks(curFile, maxLines, maxLineCharacters, input)
+ diff.TotalAddition += curFile.Addition
+ diff.TotalDeletion += curFile.Deletion
+ if err != nil {
+ if err != io.EOF {
+ return diff, err
+ }
+ break parsingLoop
+ }
+ sb.Reset()
+ _, _ = sb.Write(lineBytes)
+ for isFragment {
+ lineBytes, isFragment, err = input.ReadLine()
+ if err != nil {
+ // Now by the definition of ReadLine this cannot be io.EOF
+ return diff, fmt.Errorf("Unable to ReadLine: %v", err)
+ }
+ _, _ = sb.Write(lineBytes)
+ }
+ line = sb.String()
+ sb.Reset()
+
+ break curFileLoop
}
}
- line := linebuf.String()
- if strings.HasPrefix(line, "+++ ") || strings.HasPrefix(line, "--- ") || len(line) == 0 {
- continue
+ }
+
+ // FIXME: There are numerous issues with this:
+ // - we might want to consider detecting encoding while parsing but...
+ // - we're likely to fail to get the correct encoding here anyway as we won't have enough information
+ // - and this doesn't really account for changes in encoding
+ var buf bytes.Buffer
+ for _, f := range diff.Files {
+ buf.Reset()
+ for _, sec := range f.Sections {
+ for _, l := range sec.Lines {
+ if l.Type == DiffLineSection {
+ continue
+ }
+ buf.WriteString(l.Content[1:])
+ buf.WriteString("\n")
+ }
}
+ charsetLabel, err := charset.DetectEncoding(buf.Bytes())
+ if charsetLabel != "UTF-8" && err == nil {
+ encoding, _ := stdcharset.Lookup(charsetLabel)
+ if encoding != nil {
+ d := encoding.NewDecoder()
+ for _, sec := range f.Sections {
+ for _, l := range sec.Lines {
+ if l.Type == DiffLineSection {
+ continue
+ }
+ if c, _, err := transform.String(d, l.Content[1:]); err == nil {
+ l.Content = l.Content[0:1] + c
+ }
+ }
+ }
+ }
+ }
+ }
- trimLine := strings.Trim(line, "+- ")
+ diff.NumFiles = len(diff.Files)
+ return diff, nil
+}
- if trimLine == models.LFSMetaFileIdentifier {
- curFileLFSPrefix = true
- }
+func parseHunks(curFile *DiffFile, maxLines, maxLineCharacters int, input *bufio.Reader) (lineBytes []byte, isFragment bool, err error) {
+ sb := strings.Builder{}
- if curFileLFSPrefix && strings.HasPrefix(trimLine, models.LFSMetaFileOidPrefix) {
- oid := strings.TrimPrefix(trimLine, models.LFSMetaFileOidPrefix)
+ var (
+ curSection *DiffSection
+ curFileLinesCount int
+ curFileLFSPrefix bool
+ )
- if len(oid) == 64 {
- m := &models.LFSMetaObject{Oid: oid}
- count, err := models.Count(m)
+ leftLine, rightLine := 1, 1
- if err == nil && count > 0 {
- curFile.IsBin = true
- curFile.IsLFSFile = true
- curSection.Lines = nil
- }
+ for {
+ sb.Reset()
+ lineBytes, isFragment, err = input.ReadLine()
+ if err != nil {
+ if err == io.EOF {
+ return
}
+ err = fmt.Errorf("Unable to ReadLine: %v", err)
+ return
+ }
+ if lineBytes[0] == 'd' {
+ // End of hunks
+ return
}
- curFileLinesCount++
- lineCount++
+ switch lineBytes[0] {
+ case '@':
+ if curFileLinesCount >= maxLines {
+ curFile.IsIncomplete = true
+ continue
+ }
- // Diff data too large, we only show the first about maxLines lines
- if curFileLinesCount >= maxLines {
- curFile.IsIncomplete = true
- }
- switch {
- case line[0] == ' ':
- diffLine := &DiffLine{Type: DiffLinePlain, Content: line, LeftIdx: leftLine, RightIdx: rightLine}
- leftLine++
- rightLine++
- curSection.Lines = append(curSection.Lines, diffLine)
- curSection.FileName = curFile.Name
- continue
- case line[0] == '@':
+ _, _ = sb.Write(lineBytes)
+ for isFragment {
+ // This is very odd indeed - we're in a section header and the line is too long
+ // This really shouldn't happen...
+ lineBytes, isFragment, err = input.ReadLine()
+ if err != nil {
+ // Now by the definition of ReadLine this cannot be io.EOF
+ err = fmt.Errorf("Unable to ReadLine: %v", err)
+ return
+ }
+ _, _ = sb.Write(lineBytes)
+ }
+ line := sb.String()
+
+ // Create a new section to represent this hunk
curSection = &DiffSection{}
curFile.Sections = append(curFile.Sections, curSection)
+
lineSectionInfo := getDiffLineSectionInfo(curFile.Name, line, leftLine-1, rightLine-1)
diffLine := &DiffLine{
Type: DiffLineSection,
@@ -538,148 +699,132 @@ func ParsePatch(maxLines, maxLineCharacters, maxFiles int, reader io.Reader) (*D
leftLine = lineSectionInfo.LeftIdx
rightLine = lineSectionInfo.RightIdx
continue
- case line[0] == '+':
+ case '\\':
+ if curFileLinesCount >= maxLines {
+ curFile.IsIncomplete = true
+ continue
+ }
+ // This is used only to indicate that the current file does not have a terminal newline
+ if !bytes.Equal(lineBytes, []byte("\\ No newline at end of file")) {
+ err = fmt.Errorf("Unexpected line in hunk: %s", string(lineBytes))
+ return
+ }
+ // Technically this should be the end the file!
+ // FIXME: we should be putting a marker at the end of the file if there is no terminal new line
+ continue
+ case '+':
+ curFileLinesCount++
curFile.Addition++
- diff.TotalAddition++
- diffLine := &DiffLine{Type: DiffLineAdd, Content: line, RightIdx: rightLine}
+ if curFileLinesCount >= maxLines {
+ curFile.IsIncomplete = true
+ continue
+ }
+ diffLine := &DiffLine{Type: DiffLineAdd, RightIdx: rightLine}
rightLine++
curSection.Lines = append(curSection.Lines, diffLine)
- curSection.FileName = curFile.Name
- continue
- case line[0] == '-':
+ case '-':
+ curFileLinesCount++
curFile.Deletion++
- diff.TotalDeletion++
- diffLine := &DiffLine{Type: DiffLineDel, Content: line, LeftIdx: leftLine}
+ if curFileLinesCount >= maxLines {
+ curFile.IsIncomplete = true
+ continue
+ }
+ diffLine := &DiffLine{Type: DiffLineDel, LeftIdx: leftLine}
if leftLine > 0 {
leftLine++
}
curSection.Lines = append(curSection.Lines, diffLine)
- curSection.FileName = curFile.Name
- case strings.HasPrefix(line, "Binary"):
- curFile.IsBin = true
- continue
+ case ' ':
+ curFileLinesCount++
+ if curFileLinesCount >= maxLines {
+ curFile.IsIncomplete = true
+ continue
+ }
+ diffLine := &DiffLine{Type: DiffLinePlain, LeftIdx: leftLine, RightIdx: rightLine}
+ leftLine++
+ rightLine++
+ curSection.Lines = append(curSection.Lines, diffLine)
+ default:
+ // This is unexpected
+ err = fmt.Errorf("Unexpected line in hunk: %s", string(lineBytes))
+ return
}
- // Get new file.
- if strings.HasPrefix(line, cmdDiffHead) {
- if len(diff.Files) >= maxFiles {
- diff.IsIncomplete = true
- _, err := io.Copy(ioutil.Discard, reader)
+ line := string(lineBytes)
+ if isFragment {
+ curFile.IsIncomplete = true
+ for isFragment {
+ lineBytes, isFragment, err = input.ReadLine()
if err != nil {
- return nil, fmt.Errorf("Copy: %v", err)
+ // Now by the definition of ReadLine this cannot be io.EOF
+ err = fmt.Errorf("Unable to ReadLine: %v", err)
+ return
}
- break
}
+ }
+ curSection.Lines[len(curSection.Lines)-1].Content = line
- // Note: In case file name is surrounded by double quotes (it happens only in git-shell).
- // e.g. diff --git "a/xxx" "b/xxx"
- var a string
- var b string
-
- rd := strings.NewReader(line[len(cmdDiffHead):])
- char, _ := rd.ReadByte()
- _ = rd.UnreadByte()
- if char == '"' {
- fmt.Fscanf(rd, "%q ", &a)
- if a[0] == '\\' {
- a = a[1:]
- }
- } else {
- fmt.Fscanf(rd, "%s ", &a)
- }
- char, _ = rd.ReadByte()
- _ = rd.UnreadByte()
- if char == '"' {
- fmt.Fscanf(rd, "%q", &b)
- if b[0] == '\\' {
- b = b[1:]
- }
- } else {
- fmt.Fscanf(rd, "%s", &b)
- }
- a = a[2:]
- b = b[2:]
-
- curFile = &DiffFile{
- Name: b,
- OldName: a,
- Index: len(diff.Files) + 1,
- Type: DiffFileChange,
- Sections: make([]*DiffSection, 0, 10),
- IsRenamed: a != b,
- }
- diff.Files = append(diff.Files, curFile)
- curFileLinesCount = 0
- leftLine = 1
- rightLine = 1
- curFileLFSPrefix = false
-
- // Check file diff type and is submodule.
- for {
- line, err := input.ReadString('\n')
- if err != nil {
- if err == io.EOF {
- isEOF = true
- } else {
- return nil, fmt.Errorf("ReadString: %v", err)
- }
- }
+ // handle LFS
+ if line[1:] == models.LFSMetaFileIdentifier {
+ curFileLFSPrefix = true
+ } else if curFileLFSPrefix && strings.HasPrefix(line[1:], models.LFSMetaFileOidPrefix) {
+ oid := strings.TrimPrefix(line[1:], models.LFSMetaFileOidPrefix)
+ if len(oid) == 64 {
+ m := &models.LFSMetaObject{Oid: oid}
+ count, err := models.Count(m)
- switch {
- case strings.HasPrefix(line, "copy from "):
- curFile.IsRenamed = true
- curFile.Type = DiffFileCopy
- case strings.HasPrefix(line, "copy to "):
- curFile.IsRenamed = true
- curFile.Type = DiffFileCopy
- case strings.HasPrefix(line, "new file"):
- curFile.Type = DiffFileAdd
- curFile.IsCreated = true
- case strings.HasPrefix(line, "deleted"):
- curFile.Type = DiffFileDel
- curFile.IsDeleted = true
- case strings.HasPrefix(line, "index"):
- curFile.Type = DiffFileChange
- case strings.HasPrefix(line, "similarity index 100%"):
- curFile.Type = DiffFileRename
- }
- if curFile.Type > 0 {
- if strings.HasSuffix(line, " 160000\n") {
- curFile.IsSubmodule = true
- }
- break
+ if err == nil && count > 0 {
+ curFile.IsBin = true
+ curFile.IsLFSFile = true
+ curSection.Lines = nil
}
}
}
}
+}
- // FIXME: detect encoding while parsing.
- var buf bytes.Buffer
- for _, f := range diff.Files {
- buf.Reset()
- for _, sec := range f.Sections {
- for _, l := range sec.Lines {
- buf.WriteString(l.Content)
- buf.WriteString("\n")
- }
- }
- charsetLabel, err := charset.DetectEncoding(buf.Bytes())
- if charsetLabel != "UTF-8" && err == nil {
- encoding, _ := stdcharset.Lookup(charsetLabel)
- if encoding != nil {
- d := encoding.NewDecoder()
- for _, sec := range f.Sections {
- for _, l := range sec.Lines {
- if c, _, err := transform.String(d, l.Content); err == nil {
- l.Content = c
- }
- }
- }
- }
+func createDiffFile(diff *Diff, line string) *DiffFile {
+ // The a/ and b/ filenames are the same unless rename/copy is involved.
+ // Especially, even for a creation or a deletion, /dev/null is not used
+ // in place of the a/ or b/ filenames.
+ //
+ // When rename/copy is involved, file1 and file2 show the name of the
+ // source file of the rename/copy and the name of the file that rename/copy
+ // produces, respectively.
+ //
+ // Path names are quoted if necessary.
+ //
+ // This means that you should always be able to determine the file name even when there
+ // there is potential ambiguity...
+ //
+ // but we can be simpler with our heuristics by just forcing git to prefix things nicely
+ curFile := &DiffFile{
+ Index: len(diff.Files) + 1,
+ Type: DiffFileChange,
+ Sections: make([]*DiffSection, 0, 10),
+ }
+
+ rd := strings.NewReader(line[len(cmdDiffHead):] + " ")
+ curFile.Type = DiffFileChange
+ curFile.OldName = readFileName(rd)
+ curFile.Name = readFileName(rd)
+ curFile.IsRenamed = curFile.Name != curFile.OldName
+ return curFile
+}
+
+func readFileName(rd *strings.Reader) string {
+ var name string
+ char, _ := rd.ReadByte()
+ _ = rd.UnreadByte()
+ if char == '"' {
+ fmt.Fscanf(rd, "%q ", &name)
+ if name[0] == '\\' {
+ name = name[1:]
}
+ } else {
+ fmt.Fscanf(rd, "%s ", &name)
}
- diff.NumFiles = len(diff.Files)
- return diff, nil
+ return name[2:]
}
// GetDiffRange builds a Diff between two commits of a repository.