.Builder{} // OK let's set a reasonable buffer size. // This should be let's say at least the size of maxLineCharacters or 4096 whichever is larger. readerSize := maxLineCharacters if readerSize < 4096 { readerSize = 4096 } input := bufio.NewReaderSize(reader, readerSize) line, err := input.ReadString('\n') if err != nil { if err == io.EOF { return diff, nil } return diff, err } parsingLoop: for { // 1. A patch file always begins with `diff --git ` + `a/path b/path` (possibly quoted) // if it does not we have bad input! if !strings.HasPrefix(line, cmdDiffHead) { return diff, fmt.Errorf("Invalid first file line: %s", line) } // TODO: Handle skipping first n files if len(diff.Files) >= maxFiles { diff.IsIncomplete = true _, err := io.Copy(io.Discard, reader) if err != nil { // By the definition of io.Copy this never returns io.EOF return diff, fmt.Errorf("Copy: %v", err) } break parsingLoop } curFile = createDiffFile(diff, line) diff.Files = append(diff.Files, curFile) // 2. It is followed by one or more extended header lines: // // old mode <mode> // new mode <mode> // deleted file mode <mode> // new file mode <mode> // copy from <path> // copy to <path> // rename from <path> // rename to <path> // similarity index <number> // dissimilarity index <number> // index <hash>..<hash> <mode> // // * <mode> 6-digit octal numbers including the file type and file permission bits. // * <path> does not include the a/ and b/ prefixes // * <number> percentage of unchanged lines for similarity, percentage of changed // lines dissimilarity as integer rounded down with terminal %. 100% => equal files. // * The index line includes the blob object names before and after the change. // The <mode> is included if the file mode does not change; otherwise, separate // lines indicate the old and the new mode. // 3. Following this header the "standard unified" diff format header may be encountered: (but not for every case...) // // --- a/<path> // +++ b/<path> // // With multiple hunks // // @@ <hunk descriptor> @@ // +added line // -removed line // unchanged line // // 4. Binary files get: // // Binary files a/<path> and b/<path> differ // // but one of a/<path> and b/<path> could be /dev/null. curFileLoop: for { line, err = input.ReadString('\n') if err != nil { if err != io.EOF { return diff, err } break parsingLoop } switch { case strings.HasPrefix(line, cmdDiffHead): break curFileLoop case strings.HasPrefix(line, "old mode ") || strings.HasPrefix(line, "new mode "): if strings.HasSuffix(line, " 160000\n") { curFile.IsSubmodule = true } case strings.HasPrefix(line, "rename from "): curFile.IsRenamed = true curFile.Type = DiffFileRename if curFile.IsAmbiguous { curFile.OldName = line[len("rename from ") : len(line)-1] } case strings.HasPrefix(line, "rename to "): curFile.IsRenamed = true curFile.Type = DiffFileRename if curFile.IsAmbiguous { curFile.Name = line[len("rename to ") : len(line)-1] curFile.IsAmbiguous = false } case strings.HasPrefix(line, "copy from "): curFile.IsRenamed = true curFile.Type = DiffFileCopy if curFile.IsAmbiguous { curFile.OldName = line[len("copy from ") : len(line)-1] } case strings.HasPrefix(line, "copy to "): curFile.IsRenamed = true curFile.Type = DiffFileCopy if curFile.IsAmbiguous { curFile.Name = line[len("copy to ") : len(line)-1] curFile.IsAmbiguous = false } case strings.HasPrefix(line, "new file"): curFile.Type = DiffFileAdd curFile.IsCreated = true if strings.HasSuffix(line, " 160000\n") { curFile.IsSubmodule = true } case strings.HasPrefix(line, "deleted"): curFile.Type = DiffFileDel curFile.IsDeleted = true if strings.HasSuffix(line, " 160000\n") { curFile.IsSubmodule = true } case strings.HasPrefix(line, "index"): if strings.HasSuffix(line, " 160000\n") { curFile.IsSubmodule = true } case strings.HasPrefix(line, "similarity index 100%"): curFile.Type = DiffFileRename case strings.HasPrefix(line, "Binary"): curFile.IsBin = true case strings.HasPrefix(line, "--- "): // Handle ambiguous filenames if curFile.IsAmbiguous { if len(line) > 6 && line[4] == 'a' { curFile.OldName = line[6 : len(line)-1] if line[len(line)-2] == '\t' { curFile.OldName = curFile.OldName[:len(curFile.OldName)-1] } } else { curFile.OldName = "" } } // Otherwise do nothing with this line case strings.HasPrefix(line, "+++ "): // Handle ambiguous filenames if curFile.IsAmbiguous { if len(line) > 6 && line[4] == 'b' { curFile.Name = line[6 : len(line)-1] if line[len(line)-2] == '\t' { curFile.Name = curFile.Name[:len(curFile.Name)-1] } if curFile.OldName == "" { curFile.OldName = curFile.Name } } else { curFile.Name = curFile.OldName } curFile.IsAmbiguous = false } // Otherwise do nothing with this line, but now switch to parsing hunks lineBytes, isFragment, err := parseHunks(curFile, maxLines, maxLineCharacters, input) diff.TotalAddition += curFile.Addition diff.TotalDeletion += curFile.Deletion if err != nil { if err != io.EOF { return diff, err } break parsingLoop } sb.Reset() _, _ = sb.Write(lineBytes) for isFragment { lineBytes, isFragment, err = input.ReadLine() if err != nil { // Now by the definition of ReadLine this cannot be io.EOF return diff, fmt.Errorf("Unable to ReadLine: %v", err) } _, _ = sb.Write(lineBytes) } line = sb.String() sb.Reset() break curFileLoop } } } // TODO: There are numerous issues with this: // - we might want to consider detecting encoding while parsing but... // - we're likely to fail to get the correct encoding here anyway as we won't have enough information var diffLineTypeBuffers = make(map[DiffLineType]*bytes.Buffer, 3) var diffLineTypeDecoders = make(map[DiffLineType]*encoding.Decoder, 3) diffLineTypeBuffers[DiffLinePlain] = new(bytes.Buffer) diffLineTypeBuffers[DiffLineAdd] = new(bytes.Buffer) diffLineTypeBuffers[DiffLineDel] = new(bytes.Buffer) for _, f := range diff.Files { for _, buffer := range diffLineTypeBuffers { buffer.Reset() } for _, sec := range f.Sections { for _, l := range sec.Lines { if l.Type == DiffLineSection { continue } diffLineTypeBuffers[l.Type].WriteString(l.Content[1:]) diffLineTypeBuffers[l.Type].WriteString("\n") } } for lineType, buffer := range diffLineTypeBuffers { diffLineTypeDecoders[lineType] = nil if buffer.Len() == 0 { continue } charsetLabel, err := charset.DetectEncoding(buffer.Bytes()) if charsetLabel != "UTF-8" && err == nil { encoding, _ := stdcharset.Lookup(charsetLabel) if encoding != nil { diffLineTypeDecoders[lineType] = encoding.NewDecoder() } } } for _, sec := range f.Sections { for _, l := range sec.Lines { decoder := diffLineTypeDecoders[l.Type] if decoder != nil { if c, _, err := transform.String(decoder, l.Content[1:]); err == nil { l.Content = l.Content[0:1] + c } } } } } diff.NumFiles = len(diff.Files) return diff, nil } func parseHunks(curFile *DiffFile, maxLines, maxLineCharacters int, input *bufio.Reader) (lineBytes []byte, isFragment bool, err error) { sb := strings.Builder{} var ( curSection *DiffSection curFileLinesCount int curFileLFSPrefix bool ) lastLeftIdx := -1 leftLine, rightLine := 1, 1 for { for isFragment { curFile.IsIncomplete = true curFile.IsIncompleteLineTooLong = true _, isFragment, err = input.ReadLine() if err != nil { // Now by the definition of ReadLine this cannot be io.EOF err = fmt.Errorf("Unable to ReadLine: %v", err) return } } sb.Reset() lineBytes, isFragment, err = input.ReadLine() if err != nil { if err == io.EOF { return } err = fmt.Errorf("Unable to ReadLine: %v", err) return } if lineBytes[0] == 'd' { // End of hunks return } switch lineBytes[0] { case '@': if curFileLinesCount >= maxLines { curFile.IsIncomplete = true continue } _, _ = sb.Write(lineBytes) for isFragment { // This is very odd indeed - we're in a section header and the line is too long // This really shouldn't happen... lineBytes, isFragment, err = input.ReadLine() if err != nil { // Now by the definition of ReadLine this cannot be io.EOF err = fmt.Errorf("Unable to ReadLine: %v", err) return } _, _ = sb.Write(lineBytes) } line := sb.String() // Create a new section to represent this hunk curSection = &DiffSection{} lastLeftIdx = -1 curFile.Sections = append(curFile.Sections, curSection) lineSectionInfo := getDiffLineSectionInfo(curFile.Name, line, leftLine-1, rightLine-1) diffLine := &DiffLine{ Type: DiffLineSection, Content: line, SectionInfo: lineSectionInfo, } curSection.Lines = append(curSection.Lines, diffLine) curSection.FileName = curFile.Name // update line number. leftLine = lineSectionInfo.LeftIdx rightLine = lineSectionInfo.RightIdx continue case '\\': if curFileLinesCount >= maxLines { curFile.IsIncomplete = true continue } // This is used only to indicate that the current file does not have a terminal newline if !bytes.Equal(lineBytes, []byte("\\ No newline at end of file")) { err = fmt.Errorf("Unexpected line in hunk: %s", string(lineBytes)) return } // Technically this should be the end the file! // FIXME: we should be putting a marker at the end of the file if there is no terminal new line continue case '+': curFileLinesCount++ curFile.Addition++ if curFileLinesCount >= maxLines { curFile.IsIncomplete = true continue } diffLine := &DiffLine{Type: DiffLineAdd, RightIdx: rightLine, Match: -1} rightLine++ if curSection == nil { // Create a new section to represent this hunk curSection = &DiffSection{} curFile.Sections = append(curFile.Sections, curSection) lastLeftIdx = -1 } if lastLeftIdx > -1 { diffLine.Match = lastLeftIdx curSection.Lines[lastLeftIdx].Match = len(curSection.Lines) lastLeftIdx++ if lastLeftIdx >= len(curSection.Lines) || curSection.Lines[lastLeftIdx].Type != DiffLineDel { lastLeftIdx = -1 } } curSection.Lines = append(curSection.Lines, diffLine) case '-': curFileLinesCount++ curFile.Deletion++ if curFileLinesCount >= maxLines { curFile.IsIncomplete = true continue } diffLine := &DiffLine{Type: DiffLineDel, LeftIdx: leftLine, Match: -1} if leftLine > 0 { leftLine++ } if curSection == nil { // Create a new section to represent this hunk curSection = &DiffSection{} curFile.Sections = append(curFile.Sections, curSection) lastLeftIdx = -1 } if len(curSection.Lines) == 0 || curSection.Lines[len(curSection.Lines)-1].Type != DiffLineDel { lastLeftIdx = len(curSection.Lines) } curSection.Lines = append(curSection.Lines, diffLine) case ' ': curFileLinesCount++ if curFileLinesCount >= maxLines { curFile.IsIncomplete = true continue } diffLine := &DiffLine{Type: DiffLinePlain, LeftIdx: leftLine, RightIdx: rightLine} leftLine++ rightLine++ lastLeftIdx = -1 if curSection == nil { // Create a new section to represent this hunk curSection = &DiffSection{} curFile.Sections = append(curFile.Sections, curSection) } curSection.Lines = append(curSection.Lines, diffLine) default: // This is unexpected err = fmt.Errorf("Unexpected line in hunk: %s", string(lineBytes)) return } line := string(lineBytes) if isFragment { curFile.IsIncomplete = true curFile.IsIncompleteLineTooLong = true for isFragment { lineBytes, isFragment, err = input.ReadLine() if err != nil { // Now by the definition of ReadLine this cannot be io.EOF err = fmt.Errorf("Unable to ReadLine: %v", err) return } } } if len(line) > maxLineCharacters { curFile.IsIncomplete = true curFile.IsIncompleteLineTooLong = true line = line[:maxLineCharacters] } curSection.Lines[len(curSection.Lines)-1].Content = line // handle LFS if line[1:] == lfs.MetaFileIdentifier { curFileLFSPrefix = true } else if curFileLFSPrefix && strings.HasPrefix(line[1:], lfs.MetaFileOidPrefix) { oid := strings.TrimPrefix(line[1:], lfs.MetaFileOidPrefix) if len(oid) == 64 { m := &models.LFSMetaObject{Pointer: lfs.Pointer{Oid: oid}} count, err := db.Count(m) if err == nil && count > 0 { curFile.IsBin = true curFile.IsLFSFile = true curSection.Lines = nil lastLeftIdx = -1 } } } } } func createDiffFile(diff *Diff, line string) *DiffFile { // The a/ and b/ filenames are the same unless rename/copy is involved. // Especially, even for a creation or a deletion, /dev/null is not used // in place of the a/ or b/ filenames. // // When rename/copy is involved, file1 and file2 show the name of the // source file of the rename/copy and the name of the file that rename/copy // produces, respectively. // // Path names are quoted if necessary. // // This means that you should always be able to determine the file name even when there // there is potential ambiguity... // // but we can be simpler with our heuristics by just forcing git to prefix things nicely curFile := &DiffFile{ Index: len(diff.Files) + 1, Type: DiffFileChange, Sections: make([]*DiffSection, 0, 10), } rd := strings.NewReader(line[len(cmdDiffHead):] + " ") curFile.Type = DiffFileChange oldNameAmbiguity := false newNameAmbiguity := false curFile.OldName, oldNameAmbiguity = readFileName(rd) curFile.Name, newNameAmbiguity = readFileName(rd) if oldNameAmbiguity && newNameAmbiguity { curFile.IsAmbiguous = true // OK we should bet that the oldName and the newName are the same if they can be made to be same // So we need to start again ... if (len(line)-len(cmdDiffHead)-1)%2 == 0 { // diff --git a/b b/b b/b b/b b/b b/b // midpoint := (len(line) + len(cmdDiffHead) - 1) / 2 new, old := line[len(cmdDiffHead):midpoint], line[midpoint+1:] if len(new) > 2 && len(old) > 2 && new[2:] == old[2:] { curFile.OldName = old[2:] curFile.Name = old[2:] } } } curFile.IsRenamed = curFile.Name != curFile.OldName return curFile } func readFileName(rd *strings.Reader) (string, bool) { ambiguity := false var name string char, _ := rd.ReadByte() _ = rd.UnreadByte() if char == '"' { fmt.Fscanf(rd, "%q ", &name) if name[0] == '\\' { name = name[1:] } } else { // This technique is potentially ambiguous it may not be possible to uniquely identify the filenames from the diff line alone ambiguity = true fmt.Fscanf(rd, "%s ", &name) char, _ := rd.ReadByte() _ = rd.UnreadByte() for !(char == 0 || char == '"' || char == 'b') { var suffix string fmt.Fscanf(rd, "%s ", &suffix) name += " " + suffix char, _ = rd.ReadByte() _ = rd.UnreadByte() } } if len(name) < 2 { log.Error("Unable to determine name from reader: %v", rd) return "", true } return name[2:], ambiguity } // GetDiffRangeWithWhitespaceBehavior builds a Diff between two commits of a repository. // Passing the empty string as beforeCommitID returns a diff from the parent commit. // The whitespaceBehavior is either an empty string or a git flag func GetDiffRangeWithWhitespaceBehavior(gitRepo *git.Repository, beforeCommitID, afterCommitID string, maxLines, maxLineCharacters, maxFiles int, whitespaceBehavior string, directComparison bool) (*Diff, error) { repoPath := gitRepo.Path commit, err := gitRepo.GetCommit(afterCommitID) if err != nil { return nil, err } ctx, cancel := context.WithTimeout(git.DefaultContext, time.Duration(setting.Git.Timeout.Default)*time.Second) defer cancel() var cmd *exec.Cmd if (len(beforeCommitID) == 0 || beforeCommitID == git.EmptySHA) && commit.ParentCount() == 0 { diffArgs := []string{"diff", "--src-prefix=\\a/", "--dst-prefix=\\b/", "-M"} if len(whitespaceBehavior) != 0 { diffArgs = append(diffArgs, whitespaceBehavior) } // append empty tree ref diffArgs = append(diffArgs, "4b825dc642cb6eb9a060e54bf8d69288fbee4904") diffArgs = append(diffArgs, afterCommitID) cmd = exec.CommandContext(ctx, git.GitExecutable, diffArgs...) } else { actualBeforeCommitID := beforeCommitID if len(actualBeforeCommitID) == 0 { parentCommit, _ := commit.Parent(0) actualBeforeCommitID = parentCommit.ID.String() } diffArgs := []string{"diff", "--src-prefix=\\a/", "--dst-prefix=\\b/", "-M"} if len(whitespaceBehavior) != 0 { diffArgs = append(diffArgs, whitespaceBehavior) } diffArgs = append(diffArgs, actualBeforeCommitID) diffArgs = append(diffArgs, afterCommitID) cmd = exec.CommandContext(ctx, git.GitExecutable, diffArgs...) beforeCommitID = actualBeforeCommitID } cmd.Dir = repoPath cmd.Stderr = os.Stderr stdout, err := cmd.StdoutPipe() if err != nil { return nil, fmt.Errorf("StdoutPipe: %v", err) } if err = cmd.Start(); err != nil { return nil, fmt.Errorf("Start: %v", err) } pid := process.GetManager().Add(fmt.Sprintf("GetDiffRange [repo_path: %s]", repoPath), cancel) defer process.GetManager().Remove(pid) diff, err := ParsePatch(maxLines, maxLineCharacters, maxFiles, stdout) if err != nil { return nil, fmt.Errorf("ParsePatch: %v", err) } var checker *git.CheckAttributeReader if git.CheckGitVersionAtLeast("1.7.8") == nil { indexFilename, deleteTemporaryFile, err := gitRepo.ReadTreeToTemporaryIndex(afterCommitID) if err == nil { defer deleteTemporaryFile() workdir, err := os.MkdirTemp("", "empty-work-dir") if err != nil { log.Error("Unable to create temporary directory: %v", err) return nil, err } defer func() { _ = util.RemoveAll(workdir) }() checker = &git.CheckAttributeReader{ Attributes: []string{"linguist-vendored", "linguist-generated"}, Repo: gitRepo, IndexFile: indexFilename, WorkTree: workdir, } ctx, cancel := context.WithCancel(git.DefaultContext) if err := checker.Init(ctx); err != nil { log.Error("Unable to open checker for %s. Error: %v", afterCommitID, err) } else { go func() { err = checker.Run() if err != nil && err != ctx.Err() { log.Error("Unable to open checker for %s. Error: %v", afterCommitID, err) } cancel() }() } defer func() { cancel() }() } } for _, diffFile := range diff.Files { gotVendor := false gotGenerated := false if checker != nil { attrs, err := checker.CheckPath(diffFile.Name) if err == nil { if vendored, has := attrs["linguist-vendored"]; has { if vendored == "set" || vendored == "true" { diffFile.IsVendored = true gotVendor = true } else { gotVendor = vendored == "false" } } if generated, has := attrs["linguist-generated"]; has { if generated == "set" || generated == "true" { diffFile.IsGenerated = true gotGenerated = true } else { gotGenerated = generated == "false" } } } else { log.Error("Unexpected error: %v", err) } } if !gotVendor { diffFile.IsVendored = analyze.IsVendor(diffFile.Name) } if !gotGenerated { diffFile.IsGenerated = analyze.IsGenerated(diffFile.Name) } tailSection := diffFile.GetTailSection(gitRepo, beforeCommitID, afterCommitID) if tailSection != nil { diffFile.Sections = append(diffFile.Sections, tailSection) } } if err = cmd.Wait(); err != nil { return nil, fmt.Errorf("Wait: %v", err) } separator := "..." if directComparison { separator = ".." } shortstatArgs := []string{beforeCommitID + separator + afterCommitID} if len(beforeCommitID) == 0 || beforeCommitID == git.EmptySHA { shortstatArgs = []string{git.EmptyTreeSHA, afterCommitID} } diff.NumFiles, diff.TotalAddition, diff.TotalDeletion, err = git.GetDiffShortStat(repoPath, shortstatArgs...) if err != nil && strings.Contains(err.Error(), "no merge base") { // git >= 2.28 now returns an error if base and head have become unrelated. // previously it would return the results of git diff --shortstat base head so let's try that... shortstatArgs = []string{beforeCommitID, afterCommitID} diff.NumFiles, diff.TotalAddition, diff.TotalDeletion, err = git.GetDiffShortStat(repoPath, shortstatArgs...) } if err != nil { return nil, err } return diff, nil } // GetDiffCommitWithWhitespaceBehavior builds a Diff representing the given commitID. // The whitespaceBehavior is either an empty string or a git flag func GetDiffCommitWithWhitespaceBehavior(gitRepo *git.Repository, commitID string, maxLines, maxLineCharacters, maxFiles int, whitespaceBehavior string, directComparison bool) (*Diff, error) { return GetDiffRangeWithWhitespaceBehavior(gitRepo, "", commitID, maxLines, maxLineCharacters, maxFiles, whitespaceBehavior, directComparison) } // CommentAsDiff returns c.Patch as *Diff func CommentAsDiff(c *models.Comment) (*Diff, error) { diff, err := ParsePatch(setting.Git.MaxGitDiffLines, setting.Git.MaxGitDiffLineCharacters, setting.Git.MaxGitDiffFiles, strings.NewReader(c.Patch)) if err != nil { log.Error("Unable to parse patch: %v", err) return nil, err } if len(diff.Files) == 0 { return nil, fmt.Errorf("no file found for comment ID: %d", c.ID) } secs := diff.Files[0].Sections if len(secs) == 0 { return nil, fmt.Errorf("no sections found for comment ID: %d", c.ID) } return diff, nil } // CommentMustAsDiff executes AsDiff and logs the error instead of returning func CommentMustAsDiff(c *models.Comment) *Diff { if c == nil { return nil } defer func() { if err := recover(); err != nil { log.Error("PANIC whilst retrieving diff for comment[%d] Error: %v\nStack: %s", c.ID, err, log.Stack(2)) } }() diff, err := CommentAsDiff(c) if err != nil { log.Warn("CommentMustAsDiff: %v", err) } return diff } // GetWhitespaceFlag returns git diff flag for treating whitespaces func GetWhitespaceFlag(whiteSpaceBehavior string) string { whitespaceFlags := map[string]string{ "ignore-all": "-w", "ignore-change": "-b", "ignore-eol": "--ignore-space-at-eol", "": ""} return whitespaceFlags[whiteSpaceBehavior] }