]> source.dussan.org Git - gitea.git/commitdiff
Speed up `enry.IsVendor` (#15213)
author: zeripath <art27@cantab.net>
Thu, 1 Apr 2021 17:41:09 +0000 (18:41 +0100)
committer: GitHub <noreply@github.com>
Thu, 1 Apr 2021 17:41:09 +0000 (19:41 +0200)
`enry.IsVendor` is kinda slow as it simply iterates across all regexps.
This PR adjusts the regexps to combine them to make this process a
little quicker.

Related #15143

Signed-off-by: Andrew Thornton <art27@cantab.net>
modules/analyze/vendor.go [new file with mode: 0644]
modules/analyze/vendor_test.go [new file with mode: 0644]
modules/git/repo_language_stats_gogit.go
modules/git/repo_language_stats_nogogit.go
modules/indexer/code/bleve.go
modules/indexer/code/elastic_search.go

diff --git a/modules/analyze/vendor.go b/modules/analyze/vendor.go
new file mode 100644 (file)
index 0000000..12ae8db
--- /dev/null
@@ -0,0 +1,70 @@
+// Copyright 2021 The Gitea Authors. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+package analyze
+
+import (
+       "regexp"
+       "sort"
+       "strings"
+
+       "github.com/go-enry/go-enry/v2/data"
+)
+
+// isVendorRegExp is assigned once in init and matched against paths by IsVendor.
+var isVendorRegExp *regexp.Regexp
+
+func init() {
+       matchers := data.VendorMatchers
+
+       caretStrings := make([]string, 0, 10)
+       caretShareStrings := make([]string, 0, 10)
+
+       matcherStrings := make([]string, 0, len(matchers))
+       for _, matcher := range matchers {
+               str := matcher.String()
+               if str[0] == '^' {
+                       caretStrings = append(caretStrings, str[1:])
+               } else if str[0:5] == "(^|/)" {
+                       caretShareStrings = append(caretShareStrings, str[5:])
+               } else {
+                       matcherStrings = append(matcherStrings, str)
+               }
+       }
+
+       sort.Strings(caretShareStrings)
+       sort.Strings(caretStrings)
+       sort.Strings(matcherStrings)
+
+       sb := &strings.Builder{}
+       sb.WriteString("(?:^(?:")
+       sb.WriteString(caretStrings[0])
+       for _, matcher := range caretStrings[1:] {
+               sb.WriteString(")|(?:")
+               sb.WriteString(matcher)
+       }
+       sb.WriteString("))")
+       sb.WriteString("|")
+       sb.WriteString("(?:(?:^|/)(?:")
+       sb.WriteString(caretShareStrings[0])
+       for _, matcher := range caretShareStrings[1:] {
+               sb.WriteString(")|(?:")
+               sb.WriteString(matcher)
+       }
+       sb.WriteString("))")
+       sb.WriteString("|")
+       sb.WriteString("(?:")
+       sb.WriteString(matcherStrings[0])
+       for _, matcher := range matcherStrings[1:] {
+               sb.WriteString(")|(?:")
+               sb.WriteString(matcher)
+       }
+       sb.WriteString(")")
+       combined := sb.String()
+       isVendorRegExp = regexp.MustCompile(combined)
+}
+
+// IsVendor returns whether or not path is a vendor path.
+func IsVendor(path string) bool {
+       return isVendorRegExp.MatchString(path)
+}
diff --git a/modules/analyze/vendor_test.go b/modules/analyze/vendor_test.go
new file mode 100644 (file)
index 0000000..2784e49
--- /dev/null
@@ -0,0 +1,42 @@
+// Copyright 2021 The Gitea Authors. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+package analyze
+
+import "testing"
+
+// TestIsVendor runs IsVendor over a table of paths, one subtest per path, and
+// compares each result with the expected value.
+func TestIsVendor(t *testing.T) {
+       type vendorCase struct {
+               path string
+               want bool
+       }
+       cases := []vendorCase{
+               {path: "cache/", want: true},
+               {path: "random/cache/", want: true},
+               {path: "cache", want: false},
+               {path: "dependencies/", want: true},
+               {path: "Dependencies/", want: true},
+               {path: "dependency/", want: false},
+               {path: "dist/", want: true},
+               {path: "dist", want: false},
+               {path: "random/dist/", want: true},
+               {path: "random/dist", want: false},
+               {path: "deps/", want: true},
+               {path: "configure", want: true},
+               {path: "a/configure", want: true},
+               {path: "config.guess", want: true},
+               {path: "config.guess/", want: false},
+               {path: ".vscode/", want: true},
+               {path: "doc/_build/", want: true},
+               {path: "a/docs/_build/", want: true},
+               {path: "a/dasdocs/_build-vsdoc.js", want: true},
+               {path: "a/dasdocs/_build-vsdoc.j", want: false},
+       }
+       for i := range cases {
+               c := cases[i]
+               t.Run(c.path, func(t *testing.T) {
+                       got := IsVendor(c.path)
+                       if got == c.want {
+                               return
+                       }
+                       t.Errorf("IsVendor() = %v, want %v", got, c.want)
+               })
+       }
+}
index b5a235921c8ae02dee14c8a68caaa910a304b220..20a7b061f21070888fb189db3104b31ecdde5f10 100644 (file)
@@ -43,7 +43,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 
        sizes := make(map[string]int64)
        err = tree.Files().ForEach(func(f *object.File) error {
-               if f.Size == 0 || enry.IsVendor(f.Name) || enry.IsDotFile(f.Name) ||
+               if f.Size == 0 || analyze.IsVendor(f.Name) || enry.IsDotFile(f.Name) ||
                        enry.IsDocumentation(f.Name) || enry.IsConfiguration(f.Name) {
                        return nil
                }
index a929d7953b54b58db12dc15ea93ad96fd02808e6..3f197f8d74e9d162580f0e26cb5e92fa2ac712c5 100644 (file)
@@ -67,7 +67,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
        for _, f := range entries {
                contentBuf.Reset()
                content = contentBuf.Bytes()
-               if f.Size() == 0 || enry.IsVendor(f.Name()) || enry.IsDotFile(f.Name()) ||
+               if f.Size() == 0 || analyze.IsVendor(f.Name()) || enry.IsDotFile(f.Name()) ||
                        enry.IsDocumentation(f.Name()) || enry.IsConfiguration(f.Name()) {
                        continue
                }
index 573ea8b88cbc4931084465be13de7eff48816e54..416adeea74f2c2b5cf02c5cdda0ba40a4224e847 100644 (file)
@@ -178,7 +178,7 @@ func NewBleveIndexer(indexDir string) (*BleveIndexer, bool, error) {
 
 func (b *BleveIndexer) addUpdate(batchWriter *io.PipeWriter, batchReader *bufio.Reader, commitSha string, update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error {
        // Ignore vendored files in code search
-       if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) {
+       if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
                return nil
        }
 
index 5327eb1e51e9d0f76b2a6756d784b2ae66266b2b..ebb7910fdcb40f26021e54f73707079cecf3bbec 100644 (file)
@@ -177,7 +177,7 @@ func (b *ElasticSearchIndexer) init() (bool, error) {
 
 func (b *ElasticSearchIndexer) addUpdate(batchWriter *io.PipeWriter, batchReader *bufio.Reader, sha string, update fileUpdate, repo *models.Repository) ([]elastic.BulkableRequest, error) {
        // Ignore vendored files in code search
-       if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) {
+       if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
                return nil, nil
        }