diff options
author | zeripath <art27@cantab.net> | 2021-02-17 19:32:47 +0000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-02-17 14:32:47 -0500 |
commit | 7ba158183a34d71b3989512c059a01d35c4c4673 (patch) | |
tree | c9ae0a2d4e621c6322701b322b128dfb25fe4d69 /modules | |
parent | fe628d8406632e6f5b3969ac86d817a035d4bae9 (diff) | |
download | gitea-7ba158183a34d71b3989512c059a01d35c4c4673.tar.gz gitea-7ba158183a34d71b3989512c059a01d35c4c4673.zip |
Use cat-file --batch in GetLanguageStats (#14685)
* Use cat-file --batch in GetLanguageStats
This PR moves to using a single cat-file --batch in GetLanguageStats,
significantly reducing the number of processes spawned during language stats
processing.
Signed-off-by: Andrew Thornton <art27@cantab.net>
* placate lint
Signed-off-by: Andrew Thornton <art27@cantab.net>
* Update modules/git/repo_language_stats_nogogit.go
Co-authored-by: a1012112796 <1012112796@qq.com>
Co-authored-by: Lauris BH <lauris@nix.lv>
Co-authored-by: 6543 <6543@obermui.de>
Co-authored-by: a1012112796 <1012112796@qq.com>
Co-authored-by: Lunny Xiao <xiaolunwen@gmail.com>
Diffstat (limited to 'modules')
-rw-r--r-- | modules/git/repo_language_stats_nogogit.go | 130 |
1 files changed, 100 insertions, 30 deletions
diff --git a/modules/git/repo_language_stats_nogogit.go b/modules/git/repo_language_stats_nogogit.go index 5607e4591a..4c6f07f0fb 100644 --- a/modules/git/repo_language_stats_nogogit.go +++ b/modules/git/repo_language_stats_nogogit.go @@ -7,9 +7,11 @@ package git import ( + "bufio" "bytes" "io" - "io/ioutil" + "math" + "strings" "code.gitea.io/gitea/modules/analyze" @@ -18,16 +20,60 @@ import ( // GetLanguageStats calculates language stats for git repository at specified commit func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, error) { - // FIXME: We can be more efficient here... - // - // We're expecting that we will be reading a lot of blobs and the trees - // Thus we should use a shared `cat-file --batch` to get all of this data - // And keep the buffers around with resets as necessary. - // - // It's more complicated so... - commit, err := repo.GetCommit(commitID) + // We will feed the commit IDs in order into cat-file --batch, followed by blobs as necessary. 
+ // so let's create a batch stdin and stdout + + batchStdinReader, batchStdinWriter := io.Pipe() + batchStdoutReader, batchStdoutWriter := io.Pipe() + defer func() { + _ = batchStdinReader.Close() + _ = batchStdinWriter.Close() + _ = batchStdoutReader.Close() + _ = batchStdoutWriter.Close() + }() + + go func() { + stderr := strings.Builder{} + err := NewCommand("cat-file", "--batch").RunInDirFullPipeline(repo.Path, batchStdoutWriter, &stderr, batchStdinReader) + if err != nil { + _ = batchStdoutWriter.CloseWithError(ConcatenateError(err, (&stderr).String())) + _ = batchStdinReader.CloseWithError(ConcatenateError(err, (&stderr).String())) + } else { + _ = batchStdoutWriter.Close() + _ = batchStdinReader.Close() + } + }() + + // For simplicities sake we'll us a buffered reader + batchReader := bufio.NewReader(batchStdoutReader) + + writeID := func(id string) error { + _, err := batchStdinWriter.Write([]byte(id)) + if err != nil { + return err + } + _, err = batchStdinWriter.Write([]byte{'\n'}) + return err + } + + if err := writeID(commitID); err != nil { + return nil, err + } + shaBytes, typ, size, err := ReadBatchLine(batchReader) + if typ != "commit" { + log("Unable to get commit for: %s. Err: %v", commitID, err) + return nil, ErrNotExist{commitID, ""} + } + + sha, err := NewIDFromString(string(shaBytes)) if err != nil { - log("Unable to get commit for: %s", commitID) + log("Unable to get commit for: %s. Err: %v", commitID, err) + return nil, ErrNotExist{commitID, ""} + } + + commit, err := CommitFromReader(repo, sha, io.LimitReader(batchReader, size)) + if err != nil { + log("Unable to get commit for: %s. 
Err: %v", commitID, err) return nil, err } @@ -38,17 +84,45 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err return nil, err } + contentBuf := bytes.Buffer{} + var content []byte sizes := make(map[string]int64) for _, f := range entries { + contentBuf.Reset() + content = contentBuf.Bytes() if f.Size() == 0 || enry.IsVendor(f.Name()) || enry.IsDotFile(f.Name()) || enry.IsDocumentation(f.Name()) || enry.IsConfiguration(f.Name()) { continue } // If content can not be read or file is too big just do detection by filename - var content []byte + if f.Size() <= bigFileSize { - content, _ = readFile(f, fileSizeLimit) + if err := writeID(f.ID.String()); err != nil { + return nil, err + } + _, _, size, err := ReadBatchLine(batchReader) + if err != nil { + log("Error reading blob: %s Err: %v", f.ID.String(), err) + return nil, err + } + + sizeToRead := size + discard := int64(0) + if size > fileSizeLimit { + sizeToRead = fileSizeLimit + discard = size - fileSizeLimit + } + + _, err = contentBuf.ReadFrom(io.LimitReader(batchReader, sizeToRead)) + if err != nil { + return nil, err + } + content = contentBuf.Bytes() + err = discardFull(batchReader, discard) + if err != nil { + return nil, err + } } if enry.IsGenerated(f.Name(), content) { continue @@ -86,24 +160,20 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err return sizes, nil } -func readFile(entry *TreeEntry, limit int64) ([]byte, error) { - // FIXME: We can probably be a little more efficient here... 
see above - r, err := entry.Blob().DataAsync() - if err != nil { - return nil, err - } - defer r.Close() - - if limit <= 0 { - return ioutil.ReadAll(r) +func discardFull(rd *bufio.Reader, discard int64) error { + if discard > math.MaxInt32 { + n, err := rd.Discard(math.MaxInt32) + discard -= int64(n) + if err != nil { + return err + } } - - size := entry.Size() - if limit > 0 && size > limit { - size = limit + for discard > 0 { + n, err := rd.Discard(int(discard)) + discard -= int64(n) + if err != nil { + return err + } } - buf := bytes.NewBuffer(nil) - buf.Grow(int(size)) - _, err = io.Copy(buf, io.LimitReader(r, limit)) - return buf.Bytes(), err + return nil } |