From 1a26f6c7abef670be624c38f626acd1dee5c2bfa Mon Sep 17 00:00:00 2001 From: zeripath Date: Thu, 1 Apr 2021 23:50:12 +0100 Subject: [PATCH] Speed up `enry.IsVendor` (#15213) (#15246) Backport #15213 `enry.IsVendor` is kinda slow as it simply iterates across all regexps. This PR adjusts the regexps to combine them to make this process a little quicker. Related #15143 Signed-off-by: Andrew Thornton --- modules/analyze/vendor.go | 70 ++++++++++++++++++++++++++ modules/analyze/vendor_test.go | 42 ++++++++++++++++ modules/git/repo_language_stats.go | 2 +- modules/indexer/code/bleve.go | 2 +- modules/indexer/code/elastic_search.go | 2 +- 5 files changed, 115 insertions(+), 3 deletions(-) create mode 100644 modules/analyze/vendor.go create mode 100644 modules/analyze/vendor_test.go diff --git a/modules/analyze/vendor.go b/modules/analyze/vendor.go new file mode 100644 index 0000000000..12ae8dbd80 --- /dev/null +++ b/modules/analyze/vendor.go @@ -0,0 +1,70 @@ +// Copyright 2021 The Gitea Authors. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+
+package analyze
+
+import (
+	"regexp"
+	"sort"
+	"strings"
+
+	"github.com/go-enry/go-enry/v2/data"
+)
+
+// isVendorRegExp is the combined vendor pattern compiled by init. It stays nil
+// only if data.VendorMatchers is empty.
+var isVendorRegExp *regexp.Regexp
+
+// init folds every enry vendor matcher into one regexp so that IsVendor needs a
+// single match instead of one per matcher. Matchers are bucketed by their
+// leading anchor ("^", "(^|/)" or none) so each anchor is written only once.
+func init() {
+	var caretStrings, caretShareStrings, matcherStrings []string
+	for _, matcher := range data.VendorMatchers {
+		// HasPrefix/TrimPrefix cannot panic on an empty or very short pattern,
+		// unlike indexing str[0] or slicing str[0:5].
+		switch str := matcher.String(); {
+		case strings.HasPrefix(str, "^"):
+			caretStrings = append(caretStrings, strings.TrimPrefix(str, "^"))
+		case strings.HasPrefix(str, "(^|/)"):
+			caretShareStrings = append(caretShareStrings, strings.TrimPrefix(str, "(^|/)"))
+		default:
+			matcherStrings = append(matcherStrings, str)
+		}
+	}
+
+	// group renders `(?:anchor(?:(?:a)|(?:b)|...))` for one bucket. The group
+	// around the whole alternation is what applies anchor to every alternative:
+	// without it `^(?:a)|(?:b)` parses as `(?:^a)|(?:b)`, leaving b unanchored.
+	group := func(anchor string, alternatives []string) string {
+		if len(alternatives) == 0 {
+			return "" // skipped below: an empty group would match every path
+		}
+		sort.Strings(alternatives)
+		return "(?:" + anchor + "(?:(?:" + strings.Join(alternatives, ")|(?:") + ")))"
+	}
+
+	// Join only the non-empty buckets so no stray `|` or empty group is emitted.
+	groups := make([]string, 0, 3)
+	for _, g := range []string{
+		group("^", caretStrings),
+		group("(?:^|/)", caretShareStrings),
+		group("", matcherStrings),
+	} {
+		if g != "" {
+			groups = append(groups, g)
+		}
+	}
+	if len(groups) == 0 {
+		return // no matchers at all: leave isVendorRegExp nil
+	}
+	// MustCompile panics at start-up if the combined pattern fails to parse.
+	isVendorRegExp = regexp.MustCompile(strings.Join(groups, "|"))
+}
+
+// IsVendor returns whether or not path is a vendor path, i.e. whether it
+// matches any of the enry vendor matchers combined by init.
+func IsVendor(path string) bool {
+	return isVendorRegExp != nil && isVendorRegExp.MatchString(path)
+}
diff --git a/modules/analyze/vendor_test.go b/modules/analyze/vendor_test.go
new file mode 100644
index 0000000000..2784e49d34
--- /dev/null
+++ b/modules/analyze/vendor_test.go
@@ -0,0 +1,42 @@
+// Copyright 2021 The Gitea Authors. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+package analyze
+
+import "testing"
+
+func TestIsVendor(t *testing.T) {
+	type vendorCase struct { // one path and the result IsVendor must report for it
+		path string
+		want bool
+	}
+	for _, tc := range []vendorCase{
+		{"cache/", true},
+		{"random/cache/", true},
+		{"cache", false},
+		{"dependencies/", true},
+		{"Dependencies/", true},
+		{"dependency/", false},
+		{"dist/", true},
+		{"dist", false},
+		{"random/dist/", true},
+		{"random/dist", false},
+		{"deps/", true},
+		{"configure", true},
+		{"a/configure", true},
+		{"config.guess", true},
+		{"config.guess/", false},
+		{".vscode/", true},
+		{"doc/_build/", true},
+		{"a/docs/_build/", true},
+		{"a/dasdocs/_build-vsdoc.js", true},
+		{"a/dasdocs/_build-vsdoc.j", false},
+	} {
+		t.Run(tc.path, func(t *testing.T) { // subtest is named after the path under test
+			if got := IsVendor(tc.path); got != tc.want {
+				t.Errorf("IsVendor() = %v, want %v", got, tc.want)
+			}
+		})
+	}
+}
diff --git a/modules/git/repo_language_stats.go b/modules/git/repo_language_stats.go
index b721b996e4..573cf5df6a 100644
--- a/modules/git/repo_language_stats.go
+++ b/modules/git/repo_language_stats.go
@@ -44,7 +44,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 
 	sizes := make(map[string]int64)
 	err = tree.Files().ForEach(func(f *object.File) error {
-		if f.Size == 0 || enry.IsVendor(f.Name) || enry.IsDotFile(f.Name) ||
+		if f.Size == 0 || analyze.IsVendor(f.Name) || enry.IsDotFile(f.Name) ||
 			enry.IsDocumentation(f.Name) || enry.IsConfiguration(f.Name) {
 			return nil
 		}
diff --git a/modules/indexer/code/bleve.go b/modules/indexer/code/bleve.go
index 9caa6528f7..7458717ccc 100644
--- a/modules/indexer/code/bleve.go
+++ b/modules/indexer/code/bleve.go
@@ -175,7 +175,7 @@ func NewBleveIndexer(indexDir string) (*BleveIndexer, bool, error) {
 
 func (b *BleveIndexer) addUpdate(commitSha string, update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error {
 	// Ignore vendored files in code search
-	if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) {
+	if setting.Indexer.ExcludeVendored && 
analyze.IsVendor(update.Filename) { return nil } diff --git a/modules/indexer/code/elastic_search.go b/modules/indexer/code/elastic_search.go index 08b20b80a0..fd37462054 100644 --- a/modules/indexer/code/elastic_search.go +++ b/modules/indexer/code/elastic_search.go @@ -170,7 +170,7 @@ func (b *ElasticSearchIndexer) init() (bool, error) { func (b *ElasticSearchIndexer) addUpdate(sha string, update fileUpdate, repo *models.Repository) ([]elastic.BulkableRequest, error) { // Ignore vendored files in code search - if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) { + if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) { return nil, nil } -- 2.39.5