aboutsummaryrefslogtreecommitdiffstats
path: root/modules
diff options
context:
space:
mode:
authorzeripath <art27@cantab.net>2021-02-17 19:32:47 +0000
committerGitHub <noreply@github.com>2021-02-17 14:32:47 -0500
commit7ba158183a34d71b3989512c059a01d35c4c4673 (patch)
treec9ae0a2d4e621c6322701b322b128dfb25fe4d69 /modules
parentfe628d8406632e6f5b3969ac86d817a035d4bae9 (diff)
downloadgitea-7ba158183a34d71b3989512c059a01d35c4c4673.tar.gz
gitea-7ba158183a34d71b3989512c059a01d35c4c4673.zip
Use cat-file --batch in GetLanguageStats (#14685)
* Use cat-file --batch in GetLanguageStats This PR moves to using a single cat-file --batch in GetLanguageStats significantly reducing the number of processes spawned during language stat processing. Signed-off-by: Andrew Thornton <art27@cantab.net> * placate lint Signed-off-by: Andrew Thornton <art27@cantab.net> * Update modules/git/repo_language_stats_nogogit.go Co-authored-by: a1012112796 <1012112796@qq.com> Co-authored-by: Lauris BH <lauris@nix.lv> Co-authored-by: 6543 <6543@obermui.de> Co-authored-by: a1012112796 <1012112796@qq.com> Co-authored-by: Lunny Xiao <xiaolunwen@gmail.com>
Diffstat (limited to 'modules')
-rw-r--r--modules/git/repo_language_stats_nogogit.go130
1 files changed, 100 insertions, 30 deletions
diff --git a/modules/git/repo_language_stats_nogogit.go b/modules/git/repo_language_stats_nogogit.go
index 5607e4591a..4c6f07f0fb 100644
--- a/modules/git/repo_language_stats_nogogit.go
+++ b/modules/git/repo_language_stats_nogogit.go
@@ -7,9 +7,11 @@
package git
import (
+ "bufio"
"bytes"
"io"
- "io/ioutil"
+ "math"
+ "strings"
"code.gitea.io/gitea/modules/analyze"
@@ -18,16 +20,60 @@ import (
// GetLanguageStats calculates language stats for git repository at specified commit
func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, error) {
- // FIXME: We can be more efficient here...
- //
- // We're expecting that we will be reading a lot of blobs and the trees
- // Thus we should use a shared `cat-file --batch` to get all of this data
- // And keep the buffers around with resets as necessary.
- //
- // It's more complicated so...
- commit, err := repo.GetCommit(commitID)
+ // We will feed the commit ID, followed by blob IDs as necessary, into a single cat-file --batch process,
+ // so let's create pipes for its stdin and stdout.
+
+ batchStdinReader, batchStdinWriter := io.Pipe()
+ batchStdoutReader, batchStdoutWriter := io.Pipe()
+ defer func() {
+ _ = batchStdinReader.Close()
+ _ = batchStdinWriter.Close()
+ _ = batchStdoutReader.Close()
+ _ = batchStdoutWriter.Close()
+ }()
+
+ go func() {
+ stderr := strings.Builder{}
+ err := NewCommand("cat-file", "--batch").RunInDirFullPipeline(repo.Path, batchStdoutWriter, &stderr, batchStdinReader)
+ if err != nil {
+ _ = batchStdoutWriter.CloseWithError(ConcatenateError(err, (&stderr).String()))
+ _ = batchStdinReader.CloseWithError(ConcatenateError(err, (&stderr).String()))
+ } else {
+ _ = batchStdoutWriter.Close()
+ _ = batchStdinReader.Close()
+ }
+ }()
+
+ // For simplicity's sake we'll use a buffered reader
+ batchReader := bufio.NewReader(batchStdoutReader)
+
+ writeID := func(id string) error {
+ _, err := batchStdinWriter.Write([]byte(id))
+ if err != nil {
+ return err
+ }
+ _, err = batchStdinWriter.Write([]byte{'\n'})
+ return err
+ }
+
+ if err := writeID(commitID); err != nil {
+ return nil, err
+ }
+ shaBytes, typ, size, err := ReadBatchLine(batchReader)
+ if typ != "commit" {
+ log("Unable to get commit for: %s. Err: %v", commitID, err)
+ return nil, ErrNotExist{commitID, ""}
+ }
+
+ sha, err := NewIDFromString(string(shaBytes))
if err != nil {
- log("Unable to get commit for: %s", commitID)
+ log("Unable to get commit for: %s. Err: %v", commitID, err)
+ return nil, ErrNotExist{commitID, ""}
+ }
+
+ commit, err := CommitFromReader(repo, sha, io.LimitReader(batchReader, size))
+ if err != nil {
+ log("Unable to get commit for: %s. Err: %v", commitID, err)
return nil, err
}
@@ -38,17 +84,45 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
return nil, err
}
+ contentBuf := bytes.Buffer{}
+ var content []byte
sizes := make(map[string]int64)
for _, f := range entries {
+ contentBuf.Reset()
+ content = contentBuf.Bytes()
if f.Size() == 0 || enry.IsVendor(f.Name()) || enry.IsDotFile(f.Name()) ||
enry.IsDocumentation(f.Name()) || enry.IsConfiguration(f.Name()) {
continue
}
// If content can not be read or file is too big just do detection by filename
- var content []byte
+
if f.Size() <= bigFileSize {
- content, _ = readFile(f, fileSizeLimit)
+ if err := writeID(f.ID.String()); err != nil {
+ return nil, err
+ }
+ _, _, size, err := ReadBatchLine(batchReader)
+ if err != nil {
+ log("Error reading blob: %s Err: %v", f.ID.String(), err)
+ return nil, err
+ }
+
+ sizeToRead := size
+ discard := int64(0)
+ if size > fileSizeLimit {
+ sizeToRead = fileSizeLimit
+ discard = size - fileSizeLimit
+ }
+
+ _, err = contentBuf.ReadFrom(io.LimitReader(batchReader, sizeToRead))
+ if err != nil {
+ return nil, err
+ }
+ content = contentBuf.Bytes()
+ err = discardFull(batchReader, discard)
+ if err != nil {
+ return nil, err
+ }
}
if enry.IsGenerated(f.Name(), content) {
continue
@@ -86,24 +160,20 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
return sizes, nil
}
-func readFile(entry *TreeEntry, limit int64) ([]byte, error) {
- // FIXME: We can probably be a little more efficient here... see above
- r, err := entry.Blob().DataAsync()
- if err != nil {
- return nil, err
- }
- defer r.Close()
-
- if limit <= 0 {
- return ioutil.ReadAll(r)
+func discardFull(rd *bufio.Reader, discard int64) error {
+ if discard > math.MaxInt32 {
+ n, err := rd.Discard(math.MaxInt32)
+ discard -= int64(n)
+ if err != nil {
+ return err
+ }
}
-
- size := entry.Size()
- if limit > 0 && size > limit {
- size = limit
+ for discard > 0 {
+ n, err := rd.Discard(int(discard))
+ discard -= int64(n)
+ if err != nil {
+ return err
+ }
}
- buf := bytes.NewBuffer(nil)
- buf.Grow(int(size))
- _, err = io.Copy(buf, io.LimitReader(r, limit))
- return buf.Bytes(), err
+ return nil
}