diff options
author | wxiaoguang <wxiaoguang@gmail.com> | 2023-05-25 03:37:36 +0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-05-24 19:37:36 +0000 |
commit | 395bb33e4c9712e570f597c3ca24a4c3c6acdaff (patch) | |
tree | 66a3f0ab0f962ce102fd345cd3dcf2d1b32f5a6c /modules | |
parent | 63d5e762d8813881bd7449ac5c05ae95cc3a93eb (diff) | |
download | gitea-395bb33e4c9712e570f597c3ca24a4c3c6acdaff.tar.gz gitea-395bb33e4c9712e570f597c3ca24a4c3c6acdaff.zip |
Merge different languages for language stats (#24900)
Fix #24896
If users set language names that differ only in letter case via `linguist-language`, the `stats` map
could contain, for example: `java: 100, Java: 200`.
Language stats are stored case-insensitively in the database and there is a
unique key, so such entries conflict with each other.
So, the differently-cased language names should be merged into one unique name:
`Java: 300`.
Diffstat (limited to 'modules')
-rw-r--r-- | modules/git/repo_language_stats.go | 39 | ||||
-rw-r--r-- | modules/git/repo_language_stats_gogit.go | 2 | ||||
-rw-r--r-- | modules/git/repo_language_stats_nogogit.go | 8 | ||||
-rw-r--r-- | modules/git/repo_language_stats_test.go | 14 | ||||
-rw-r--r-- | modules/log/logger_global.go | 2 |
5 files changed, 59 insertions, 6 deletions
diff --git a/modules/git/repo_language_stats.go b/modules/git/repo_language_stats.go index 74c76b40a4..c40d6937b5 100644 --- a/modules/git/repo_language_stats.go +++ b/modules/git/repo_language_stats.go @@ -3,7 +3,46 @@ package git +import ( + "strings" + "unicode" +) + const ( fileSizeLimit int64 = 16 * 1024 // 16 KiB bigFileSize int64 = 1024 * 1024 // 1 MiB ) + +// mergeLanguageStats mergers language names with different cases. The name with most upper case letters is used. +func mergeLanguageStats(stats map[string]int64) map[string]int64 { + names := map[string]struct { + uniqueName string + upperCount int + }{} + + countUpper := func(s string) (count int) { + for _, r := range s { + if unicode.IsUpper(r) { + count++ + } + } + return count + } + + for name := range stats { + cnt := countUpper(name) + lower := strings.ToLower(name) + if cnt >= names[lower].upperCount { + names[lower] = struct { + uniqueName string + upperCount int + }{uniqueName: name, upperCount: cnt} + } + } + + res := make(map[string]int64, len(names)) + for name, num := range stats { + res[names[strings.ToLower(name)].uniqueName] += num + } + return res +} diff --git a/modules/git/repo_language_stats_gogit.go b/modules/git/repo_language_stats_gogit.go index 9674eca275..4c6fbd6c7e 100644 --- a/modules/git/repo_language_stats_gogit.go +++ b/modules/git/repo_language_stats_gogit.go @@ -156,7 +156,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err sizes[firstExcludedLanguage] = firstExcludedLanguageSize } - return sizes, nil + return mergeLanguageStats(sizes), nil } func readFile(f *object.File, limit int64) ([]byte, error) { diff --git a/modules/git/repo_language_stats_nogogit.go b/modules/git/repo_language_stats_nogogit.go index a1d28f40e8..1d94ad6c00 100644 --- a/modules/git/repo_language_stats_nogogit.go +++ b/modules/git/repo_language_stats_nogogit.go @@ -180,7 +180,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err // 
FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary? // - eg. do the all the detection tests using filename first before reading content. language := analyze.GetCodeLanguage(f.Name(), content) - if language == enry.OtherLanguage || language == "" { + if language == "" { continue } @@ -192,8 +192,8 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err included, checked := includedLanguage[language] if !checked { - langtype := enry.GetLanguageType(language) - included = langtype == enry.Programming || langtype == enry.Markup + langType := enry.GetLanguageType(language) + included = langType == enry.Programming || langType == enry.Markup includedLanguage[language] = included } if included { @@ -210,7 +210,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err sizes[firstExcludedLanguage] = firstExcludedLanguageSize } - return sizes, nil + return mergeLanguageStats(sizes), nil } func discardFull(rd *bufio.Reader, discard int64) error { diff --git a/modules/git/repo_language_stats_test.go b/modules/git/repo_language_stats_test.go index d616d95741..da3871e909 100644 --- a/modules/git/repo_language_stats_test.go +++ b/modules/git/repo_language_stats_test.go @@ -30,3 +30,17 @@ func TestRepository_GetLanguageStats(t *testing.T) { "Java": 112, }, stats) } + +func TestMergeLanguageStats(t *testing.T) { + assert.EqualValues(t, map[string]int64{ + "PHP": 1, + "python": 10, + "JAVA": 700, + }, mergeLanguageStats(map[string]int64{ + "PHP": 1, + "python": 10, + "Java": 100, + "java": 200, + "JAVA": 400, + })) +} diff --git a/modules/log/logger_global.go b/modules/log/logger_global.go index f100341254..5ccef34b5b 100644 --- a/modules/log/logger_global.go +++ b/modules/log/logger_global.go @@ -10,7 +10,7 @@ import ( // FallbackErrorf is the last chance to show an error if the logger has internal errors func FallbackErrorf(format string, args ...any) { - _, _ = 
fmt.Fprintf(os.Stderr, format+"\n", args) + _, _ = fmt.Fprintf(os.Stderr, format+"\n", args...) } func GetLevel() Level { |