diff options
Diffstat (limited to 'modules')
-rw-r--r-- | modules/analyze/generated.go | 28 | ||||
-rw-r--r-- | modules/git/repo_attribute.go | 285 | ||||
-rw-r--r-- | modules/git/repo_attribute_test.go | 159 | ||||
-rw-r--r-- | modules/git/repo_index.go | 39 | ||||
-rw-r--r-- | modules/git/repo_language_stats_gogit.go | 70 | ||||
-rw-r--r-- | modules/git/repo_language_stats_nogogit.go | 71 |
6 files changed, 640 insertions, 12 deletions
diff --git a/modules/analyze/generated.go b/modules/analyze/generated.go new file mode 100644 index 0000000000..0f14d28545 --- /dev/null +++ b/modules/analyze/generated.go @@ -0,0 +1,28 @@ +// Copyright 2021 The Gitea Authors. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package analyze + +import ( + "path/filepath" + "strings" + + "github.com/go-enry/go-enry/v2/data" +) + +// IsGenerated returns whether or not path is a generated path. +func IsGenerated(path string) bool { + ext := strings.ToLower(filepath.Ext(path)) + if _, ok := data.GeneratedCodeExtensions[ext]; ok { + return true + } + + for _, m := range data.GeneratedCodeNameMatchers { + if m(path) { + return true + } + } + + return false +} diff --git a/modules/git/repo_attribute.go b/modules/git/repo_attribute.go index aa5e4c10e7..0bd7d7e49c 100644 --- a/modules/git/repo_attribute.go +++ b/modules/git/repo_attribute.go @@ -6,7 +6,12 @@ package git import ( "bytes" + "context" "fmt" + "io" + "os" + "strconv" + "strings" ) // CheckAttributeOpts represents the possible options to CheckAttribute @@ -21,7 +26,7 @@ type CheckAttributeOpts struct { func (repo *Repository) CheckAttribute(opts CheckAttributeOpts) (map[string]map[string]string, error) { err := LoadGitVersion() if err != nil { - return nil, fmt.Errorf("Git version missing: %v", err) + return nil, fmt.Errorf("git version missing: %v", err) } stdOut := new(bytes.Buffer) @@ -55,13 +60,14 @@ func (repo *Repository) CheckAttribute(opts CheckAttributeOpts) (map[string]map[ cmd := NewCommand(cmdArgs...) if err := cmd.RunInDirPipeline(repo.Path, stdOut, stdErr); err != nil { - return nil, fmt.Errorf("Failed to run check-attr: %v\n%s\n%s", err, stdOut.String(), stdErr.String()) + return nil, fmt.Errorf("failed to run check-attr: %v\n%s\n%s", err, stdOut.String(), stdErr.String()) } + // FIXME: This is incorrect on versions < 1.8.5 fields := bytes.Split(stdOut.Bytes(), []byte{'\000'}) if len(fields)%3 != 1 { - return nil, fmt.Errorf("Wrong number of fields in return from check-attr") + return nil, fmt.Errorf("wrong number of fields in return from check-attr") } var name2attribute2info = make(map[string]map[string]string) @@ -80,3 +86,276 @@ func (repo *Repository) CheckAttribute(opts CheckAttributeOpts) (map[string]map[ return name2attribute2info, nil } + +// CheckAttributeReader provides a reader for check-attribute content that can be long running +type CheckAttributeReader struct { + // params + Attributes []string + Repo *Repository + IndexFile string + WorkTree string + + stdinReader io.ReadCloser + stdinWriter *os.File + stdOut attributeWriter + cmd *Command + env []string + ctx context.Context + cancel context.CancelFunc + running chan struct{} +} + +// Init initializes the cmd +func (c *CheckAttributeReader) Init(ctx context.Context) error { + c.running = make(chan struct{}) + cmdArgs := []string{"check-attr", "--stdin", "-z"} + + if len(c.IndexFile) > 0 && CheckGitVersionAtLeast("1.7.8") == nil { + cmdArgs = append(cmdArgs, "--cached") + c.env = []string{"GIT_INDEX_FILE=" + c.IndexFile} + } + + if len(c.WorkTree) > 0 && CheckGitVersionAtLeast("1.7.8") == nil { + c.env = []string{"GIT_WORK_TREE=" + c.WorkTree} + } + + if len(c.Attributes) > 0 { + cmdArgs = append(cmdArgs, c.Attributes...) + cmdArgs = append(cmdArgs, "--") + } else { + lw := new(nulSeparatedAttributeWriter) + lw.attributes = make(chan attributeTriple) + + c.stdOut = lw + c.stdOut.Close() + return fmt.Errorf("no provided Attributes to check") + } + + c.ctx, c.cancel = context.WithCancel(ctx) + c.cmd = NewCommandContext(c.ctx, cmdArgs...) + var err error + c.stdinReader, c.stdinWriter, err = os.Pipe() + if err != nil { + return err + } + + if CheckGitVersionAtLeast("1.8.5") == nil { + lw := new(nulSeparatedAttributeWriter) + lw.attributes = make(chan attributeTriple, 5) + + c.stdOut = lw + } else { + lw := new(lineSeparatedAttributeWriter) + lw.attributes = make(chan attributeTriple, 5) + + c.stdOut = lw + } + return nil +} + +// Run run cmd +func (c *CheckAttributeReader) Run() error { + stdErr := new(bytes.Buffer) + err := c.cmd.RunInDirTimeoutEnvFullPipelineFunc(c.env, -1, c.Repo.Path, c.stdOut, stdErr, c.stdinReader, func(_ context.Context, _ context.CancelFunc) error { + close(c.running) + return nil + }) + defer c.cancel() + _ = c.stdOut.Close() + if err != nil && c.ctx.Err() != nil && err.Error() != "signal: killed" { + return fmt.Errorf("failed to run attr-check. Error: %w\nStderr: %s", err, stdErr.String()) + } + + return nil +} + +// CheckPath check attr for given path +func (c *CheckAttributeReader) CheckPath(path string) (map[string]string, error) { + select { + case <-c.ctx.Done(): + return nil, c.ctx.Err() + case <-c.running: + } + + if _, err := c.stdinWriter.Write([]byte(path + "\x00")); err != nil { + defer c.cancel() + return nil, err + } + + if err := c.stdinWriter.Sync(); err != nil { + defer c.cancel() + return nil, err + } + + rs := make(map[string]string) + for range c.Attributes { + select { + case attr := <-c.stdOut.ReadAttribute(): + rs[attr.Attribute] = attr.Value + case <-c.ctx.Done(): + return nil, c.ctx.Err() + } + } + return rs, nil +} + +// Close close pip after use +func (c *CheckAttributeReader) Close() error { + select { + case <-c.running: + default: + close(c.running) + } + defer c.cancel() + return c.stdinWriter.Close() +} + +type attributeWriter interface { + io.WriteCloser + ReadAttribute() <-chan attributeTriple +} + +type attributeTriple struct { + Filename string + Attribute string + Value string +} + +type nulSeparatedAttributeWriter struct { + tmp []byte + attributes chan attributeTriple + working attributeTriple + pos int +} + +func (wr *nulSeparatedAttributeWriter) Write(p []byte) (n int, err error) { + l, read := len(p), 0 + + nulIdx := bytes.IndexByte(p, '\x00') + for nulIdx >= 0 { + wr.tmp = append(wr.tmp, p[:nulIdx]...) + switch wr.pos { + case 0: + wr.working = attributeTriple{ + Filename: string(wr.tmp), + } + case 1: + wr.working.Attribute = string(wr.tmp) + case 2: + wr.working.Value = string(wr.tmp) + } + wr.tmp = wr.tmp[:0] + wr.pos++ + if wr.pos > 2 { + wr.attributes <- wr.working + wr.pos = 0 + } + read += nulIdx + 1 + if l > read { + p = p[nulIdx+1:] + nulIdx = bytes.IndexByte(p, '\x00') + } else { + return l, nil + } + } + wr.tmp = append(wr.tmp, p...) + return len(p), nil +} + +func (wr *nulSeparatedAttributeWriter) ReadAttribute() <-chan attributeTriple { + return wr.attributes +} + +func (wr *nulSeparatedAttributeWriter) Close() error { + close(wr.attributes) + return nil +} + +type lineSeparatedAttributeWriter struct { + tmp []byte + attributes chan attributeTriple +} + +func (wr *lineSeparatedAttributeWriter) Write(p []byte) (n int, err error) { + l := len(p) + + nlIdx := bytes.IndexByte(p, '\n') + for nlIdx >= 0 { + wr.tmp = append(wr.tmp, p[:nlIdx]...) + + if len(wr.tmp) == 0 { + // This should not happen + if len(p) > nlIdx+1 { + wr.tmp = wr.tmp[:0] + p = p[nlIdx+1:] + nlIdx = bytes.IndexByte(p, '\n') + continue + } else { + return l, nil + } + } + + working := attributeTriple{} + if wr.tmp[0] == '"' { + sb := new(strings.Builder) + remaining := string(wr.tmp[1:]) + for len(remaining) > 0 { + rn, _, tail, err := strconv.UnquoteChar(remaining, '"') + if err != nil { + if len(remaining) > 2 && remaining[0] == '"' && remaining[1] == ':' && remaining[2] == ' ' { + working.Filename = sb.String() + wr.tmp = []byte(remaining[3:]) + break + } + return l, fmt.Errorf("unexpected tail %s", string(remaining)) + } + _, _ = sb.WriteRune(rn) + remaining = tail + } + } else { + idx := bytes.IndexByte(wr.tmp, ':') + if idx < 0 { + return l, fmt.Errorf("unexpected input %s", string(wr.tmp)) + } + working.Filename = string(wr.tmp[:idx]) + if len(wr.tmp) < idx+2 { + return l, fmt.Errorf("unexpected input %s", string(wr.tmp)) + } + wr.tmp = wr.tmp[idx+2:] + } + + idx := bytes.IndexByte(wr.tmp, ':') + if idx < 0 { + return l, fmt.Errorf("unexpected input %s", string(wr.tmp)) + } + + working.Attribute = string(wr.tmp[:idx]) + if len(wr.tmp) < idx+2 { + return l, fmt.Errorf("unexpected input %s", string(wr.tmp)) + } + + working.Value = string(wr.tmp[idx+2:]) + + wr.attributes <- working + wr.tmp = wr.tmp[:0] + if len(p) > nlIdx+1 { + p = p[nlIdx+1:] + nlIdx = bytes.IndexByte(p, '\n') + continue + } else { + return l, nil + } + } + + wr.tmp = append(wr.tmp, p...) + return l, nil +} + +func (wr *lineSeparatedAttributeWriter) ReadAttribute() <-chan attributeTriple { + return wr.attributes +} + +func (wr *lineSeparatedAttributeWriter) Close() error { + close(wr.attributes) + return nil +} diff --git a/modules/git/repo_attribute_test.go b/modules/git/repo_attribute_test.go new file mode 100644 index 0000000000..92d1a78fa4 --- /dev/null +++ b/modules/git/repo_attribute_test.go @@ -0,0 +1,159 @@ +// Copyright 2021 The Gitea Authors. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package git + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" +) + +func Test_nulSeparatedAttributeWriter_ReadAttribute(t *testing.T) { + wr := &nulSeparatedAttributeWriter{ + attributes: make(chan attributeTriple, 5), + } + + testStr := ".gitignore\"\n\x00linguist-vendored\x00unspecified\x00" + + n, err := wr.Write([]byte(testStr)) + + assert.Equal(t, n, len(testStr)) + assert.NoError(t, err) + select { + case attr := <-wr.ReadAttribute(): + assert.Equal(t, ".gitignore\"\n", attr.Filename) + assert.Equal(t, "linguist-vendored", attr.Attribute) + assert.Equal(t, "unspecified", attr.Value) + case <-time.After(100 * time.Millisecond): + assert.Fail(t, "took too long to read an attribute from the list") + } + // Write a second attribute again + n, err = wr.Write([]byte(testStr)) + + assert.Equal(t, n, len(testStr)) + assert.NoError(t, err) + + select { + case attr := <-wr.ReadAttribute(): + assert.Equal(t, ".gitignore\"\n", attr.Filename) + assert.Equal(t, "linguist-vendored", attr.Attribute) + assert.Equal(t, "unspecified", attr.Value) + case <-time.After(100 * time.Millisecond): + assert.Fail(t, "took too long to read an attribute from the list") + } + + //Write a partial attribute + _, err = wr.Write([]byte("incomplete-file")) + assert.NoError(t, err) + _, err = wr.Write([]byte("name\x00")) + assert.NoError(t, err) + + select { + case <-wr.ReadAttribute(): + assert.Fail(t, "There should not be an attribute ready to read") + case <-time.After(100 * time.Millisecond): + } + _, err = wr.Write([]byte("attribute\x00")) + assert.NoError(t, err) + select { + case <-wr.ReadAttribute(): + assert.Fail(t, "There should not be an attribute ready to read") + case <-time.After(100 * time.Millisecond): + } + + _, err = wr.Write([]byte("value\x00")) + assert.NoError(t, err) + + attr := <-wr.ReadAttribute() + assert.Equal(t, "incomplete-filename", attr.Filename) + assert.Equal(t, "attribute", attr.Attribute) + assert.Equal(t, "value", attr.Value) + + _, err = wr.Write([]byte("shouldbe.vendor\x00linguist-vendored\x00set\x00shouldbe.vendor\x00linguist-generated\x00unspecified\x00shouldbe.vendor\x00linguist-language\x00unspecified\x00")) + assert.NoError(t, err) + attr = <-wr.ReadAttribute() + assert.NoError(t, err) + assert.EqualValues(t, attributeTriple{ + Filename: "shouldbe.vendor", + Attribute: "linguist-vendored", + Value: "set", + }, attr) + attr = <-wr.ReadAttribute() + assert.NoError(t, err) + assert.EqualValues(t, attributeTriple{ + Filename: "shouldbe.vendor", + Attribute: "linguist-generated", + Value: "unspecified", + }, attr) + attr = <-wr.ReadAttribute() + assert.NoError(t, err) + assert.EqualValues(t, attributeTriple{ + Filename: "shouldbe.vendor", + Attribute: "linguist-language", + Value: "unspecified", + }, attr) +} + +func Test_lineSeparatedAttributeWriter_ReadAttribute(t *testing.T) { + wr := &lineSeparatedAttributeWriter{ + attributes: make(chan attributeTriple, 5), + } + + testStr := `".gitignore\"\n": linguist-vendored: unspecified +` + n, err := wr.Write([]byte(testStr)) + + assert.Equal(t, n, len(testStr)) + assert.NoError(t, err) + + select { + case attr := <-wr.ReadAttribute(): + assert.Equal(t, ".gitignore\"\n", attr.Filename) + assert.Equal(t, "linguist-vendored", attr.Attribute) + assert.Equal(t, "unspecified", attr.Value) + case <-time.After(100 * time.Millisecond): + assert.Fail(t, "took too long to read an attribute from the list") + } + + // Write a second attribute again + n, err = wr.Write([]byte(testStr)) + + assert.Equal(t, n, len(testStr)) + assert.NoError(t, err) + + select { + case attr := <-wr.ReadAttribute(): + assert.Equal(t, ".gitignore\"\n", attr.Filename) + assert.Equal(t, "linguist-vendored", attr.Attribute) + assert.Equal(t, "unspecified", attr.Value) + case <-time.After(100 * time.Millisecond): + assert.Fail(t, "took too long to read an attribute from the list") + } + + //Write a partial attribute + _, err = wr.Write([]byte("incomplete-file")) + assert.NoError(t, err) + _, err = wr.Write([]byte("name: ")) + assert.NoError(t, err) + select { + case <-wr.ReadAttribute(): + assert.Fail(t, "There should not be an attribute ready to read") + case <-time.After(100 * time.Millisecond): + } + _, err = wr.Write([]byte("attribute: ")) + assert.NoError(t, err) + select { + case <-wr.ReadAttribute(): + assert.Fail(t, "There should not be an attribute ready to read") + case <-time.After(100 * time.Millisecond): + } + _, err = wr.Write([]byte("value\n")) + assert.NoError(t, err) + attr := <-wr.ReadAttribute() + assert.Equal(t, "incomplete-filename", attr.Filename) + assert.Equal(t, "attribute", attr.Attribute) + assert.Equal(t, "value", attr.Value) +} diff --git a/modules/git/repo_index.go b/modules/git/repo_index.go index 2c351e209f..b301ff2437 100644 --- a/modules/git/repo_index.go +++ b/modules/git/repo_index.go @@ -6,11 +6,17 @@ package git import ( "bytes" + "context" + "io/ioutil" + "os" "strings" + + "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/util" ) // ReadTreeToIndex reads a treeish to the index -func (repo *Repository) ReadTreeToIndex(treeish string) error { +func (repo *Repository) ReadTreeToIndex(treeish string, indexFilename ...string) error { if len(treeish) != 40 { res, err := NewCommand("rev-parse", "--verify", treeish).RunInDir(repo.Path) if err != nil { @@ -24,17 +30,42 @@ func (repo *Repository) ReadTreeToIndex(treeish string) error { if err != nil { return err } - return repo.readTreeToIndex(id) + return repo.readTreeToIndex(id, indexFilename...) } -func (repo *Repository) readTreeToIndex(id SHA1) error { - _, err := NewCommand("read-tree", id.String()).RunInDir(repo.Path) +func (repo *Repository) readTreeToIndex(id SHA1, indexFilename ...string) error { + var env []string + if len(indexFilename) > 0 { + env = append(os.Environ(), "GIT_INDEX_FILE="+indexFilename[0]) + } + _, err := NewCommand("read-tree", id.String()).RunInDirWithEnv(repo.Path, env) if err != nil { return err } return nil } +// ReadTreeToTemporaryIndex reads a treeish to a temporary index file +func (repo *Repository) ReadTreeToTemporaryIndex(treeish string) (filename string, cancel context.CancelFunc, err error) { + tmpIndex, err := ioutil.TempFile("", "index") + if err != nil { + return + } + filename = tmpIndex.Name() + cancel = func() { + err := util.Remove(filename) + if err != nil { + log.Error("failed to remove tmp index file: %v", err) + } + } + err = repo.ReadTreeToIndex(treeish, filename) + if err != nil { + defer cancel() + return "", func() {}, err + } + return +} + // EmptyIndex empties the index func (repo *Repository) EmptyIndex() error { _, err := NewCommand("read-tree", "--empty").RunInDir(repo.Path) diff --git a/modules/git/repo_language_stats_gogit.go b/modules/git/repo_language_stats_gogit.go index 0a4cfbbc7b..3abce1f077 100644 --- a/modules/git/repo_language_stats_gogit.go +++ b/modules/git/repo_language_stats_gogit.go @@ -9,10 +9,12 @@ package git import ( "bytes" + "context" "io" "io/ioutil" "code.gitea.io/gitea/modules/analyze" + "code.gitea.io/gitea/modules/log" "github.com/go-enry/go-enry/v2" "github.com/go-git/go-git/v5" @@ -42,9 +44,73 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err return nil, err } + var checker *CheckAttributeReader + + if CheckGitVersionAtLeast("1.7.8") == nil { + indexFilename, deleteTemporaryFile, err := repo.ReadTreeToTemporaryIndex(commitID) + if err == nil { + defer deleteTemporaryFile() + + checker = &CheckAttributeReader{ + Attributes: []string{"linguist-vendored", "linguist-generated", "linguist-language"}, + Repo: repo, + IndexFile: indexFilename, + } + ctx, cancel := context.WithCancel(DefaultContext) + if err := checker.Init(ctx); err != nil { + log.Error("Unable to open checker for %s. Error: %v", commitID, err) + } else { + go func() { + err = checker.Run() + if err != nil { + log.Error("Unable to open checker for %s. Error: %v", commitID, err) + cancel() + } + }() + } + defer cancel() + } + } + sizes := make(map[string]int64) err = tree.Files().ForEach(func(f *object.File) error { - if f.Size == 0 || analyze.IsVendor(f.Name) || enry.IsDotFile(f.Name) || + if f.Size == 0 { + return nil + } + + notVendored := false + notGenerated := false + + if checker != nil { + attrs, err := checker.CheckPath(f.Name) + if err == nil { + if vendored, has := attrs["linguist-vendored"]; has { + if vendored == "set" || vendored == "true" { + return nil + } + notVendored = vendored == "false" + } + if generated, has := attrs["linguist-generated"]; has { + if generated == "set" || generated == "true" { + return nil + } + notGenerated = generated == "false" + } + if language, has := attrs["linguist-language"]; has && language != "unspecified" && language != "" { + // group languages, such as Pug -> HTML; SCSS -> CSS + group := enry.GetLanguageGroup(language) + if len(group) == 0 { + language = group + } + + sizes[language] += f.Size + + return nil + } + } + } + + if (!notVendored && analyze.IsVendor(f.Name)) || enry.IsDotFile(f.Name) || enry.IsDocumentation(f.Name) || enry.IsConfiguration(f.Name) { return nil } @@ -54,7 +120,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err if f.Size <= bigFileSize { content, _ = readFile(f, fileSizeLimit) } - if enry.IsGenerated(f.Name, content) { + if !notGenerated && enry.IsGenerated(f.Name, content) { return nil } diff --git a/modules/git/repo_language_stats_nogogit.go b/modules/git/repo_language_stats_nogogit.go index 7425e2dbb1..c3b96ea841 100644 --- a/modules/git/repo_language_stats_nogogit.go +++ b/modules/git/repo_language_stats_nogogit.go @@ -10,6 +10,7 @@ package git import ( "bufio" "bytes" + "context" "io" "math" @@ -62,13 +63,78 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err return nil, err } + var checker *CheckAttributeReader + + if CheckGitVersionAtLeast("1.7.8") == nil { + indexFilename, deleteTemporaryFile, err := repo.ReadTreeToTemporaryIndex(commitID) + if err == nil { + defer deleteTemporaryFile() + + checker = &CheckAttributeReader{ + Attributes: []string{"linguist-vendored", "linguist-generated", "linguist-language"}, + Repo: repo, + IndexFile: indexFilename, + } + ctx, cancel := context.WithCancel(DefaultContext) + if err := checker.Init(ctx); err != nil { + log.Error("Unable to open checker for %s. Error: %v", commitID, err) + } else { + go func() { + err = checker.Run() + if err != nil { + log.Error("Unable to open checker for %s. Error: %v", commitID, err) + cancel() + } + }() + } + defer cancel() + } + } + contentBuf := bytes.Buffer{} var content []byte sizes := make(map[string]int64) for _, f := range entries { contentBuf.Reset() content = contentBuf.Bytes() - if f.Size() == 0 || analyze.IsVendor(f.Name()) || enry.IsDotFile(f.Name()) || + + if f.Size() == 0 { + continue + } + + notVendored := false + notGenerated := false + + if checker != nil { + attrs, err := checker.CheckPath(f.Name()) + if err == nil { + if vendored, has := attrs["linguist-vendored"]; has { + if vendored == "set" || vendored == "true" { + continue + } + notVendored = vendored == "false" + } + if generated, has := attrs["linguist-generated"]; has { + if generated == "set" || generated == "true" { + continue + } + notGenerated = generated == "false" + } + if language, has := attrs["linguist-language"]; has && language != "unspecified" && language != "" { + // group languages, such as Pug -> HTML; SCSS -> CSS + group := enry.GetLanguageGroup(language) + if len(group) == 0 { + language = group + } + + sizes[language] += f.Size() + + continue + } + } + } + + if (!notVendored && analyze.IsVendor(f.Name())) || enry.IsDotFile(f.Name()) || enry.IsDocumentation(f.Name()) || enry.IsConfiguration(f.Name()) { continue } @@ -102,11 +168,10 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err return nil, err } } - if enry.IsGenerated(f.Name(), content) { + if !notGenerated && enry.IsGenerated(f.Name(), content) { continue } - // TODO: Use .gitattributes file for linguist overrides // FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary? // - eg. do the all the detection tests using filename first before reading content. language := analyze.GetCodeLanguage(f.Name(), content) |