summaryrefslogtreecommitdiffstats
path: root/modules/git
diff options
context:
space:
mode:
authorwxiaoguang <wxiaoguang@gmail.com>2023-05-25 03:37:36 +0800
committerGitHub <noreply@github.com>2023-05-24 19:37:36 +0000
commit395bb33e4c9712e570f597c3ca24a4c3c6acdaff (patch)
tree66a3f0ab0f962ce102fd345cd3dcf2d1b32f5a6c /modules/git
parent63d5e762d8813881bd7449ac5c05ae95cc3a93eb (diff)
downloadgitea-395bb33e4c9712e570f597c3ca24a4c3c6acdaff.tar.gz
gitea-395bb33e4c9712e570f597c3ca24a4c3c6acdaff.zip
Merge different languages for language stats (#24900)
Fix #24896 If users set different languages by `linguist-language`, the `stats` map could be: `java: 100, Java: 200`. Language stats are stored as case-insensitive in database and there is a unique key. So, the different language names should be merged to one unique name: `Java: 300`
Diffstat (limited to 'modules/git')
-rw-r--r--modules/git/repo_language_stats.go39
-rw-r--r--modules/git/repo_language_stats_gogit.go2
-rw-r--r--modules/git/repo_language_stats_nogogit.go8
-rw-r--r--modules/git/repo_language_stats_test.go14
4 files changed, 58 insertions, 5 deletions
diff --git a/modules/git/repo_language_stats.go b/modules/git/repo_language_stats.go
index 74c76b40a4..c40d6937b5 100644
--- a/modules/git/repo_language_stats.go
+++ b/modules/git/repo_language_stats.go
@@ -3,7 +3,46 @@
package git
+import (
+ "strings"
+ "unicode"
+)
+
const (
fileSizeLimit int64 = 16 * 1024 // 16 KiB
bigFileSize int64 = 1024 * 1024 // 1 MiB
)
+
// mergeLanguageStats merges language names that differ only by letter case
// into a single entry and sums their sizes.
//
// For every group of names sharing the same lower-cased form, the name with
// the most upper case letters is kept (e.g. "java", "Java" and "JAVA" become
// "JAVA"). When several names tie on that count, the lexicographically
// smallest one is chosen so the result does not depend on Go's randomized map
// iteration order.
//
// The input map is not modified; a new, non-nil map is always returned.
func mergeLanguageStats(stats map[string]int64) map[string]int64 {
	// candidate is the currently preferred spelling for one lower-cased name.
	type candidate struct {
		uniqueName string
		upperCount int
	}

	countUpper := func(s string) (count int) {
		for _, r := range s {
			if unicode.IsUpper(r) {
				count++
			}
		}
		return count
	}

	// First pass: choose the canonical spelling for each case-insensitive name.
	names := make(map[string]candidate, len(stats))
	for name := range stats {
		cnt := countUpper(name)
		lower := strings.ToLower(name)
		best, seen := names[lower]
		// The explicit tie-break on the name itself keeps the choice stable
		// across runs; without it the last visited name would win.
		if !seen || cnt > best.upperCount || (cnt == best.upperCount && name < best.uniqueName) {
			names[lower] = candidate{uniqueName: name, upperCount: cnt}
		}
	}

	// Second pass: accumulate every size under its canonical spelling.
	res := make(map[string]int64, len(names))
	for name, num := range stats {
		res[names[strings.ToLower(name)].uniqueName] += num
	}
	return res
}
diff --git a/modules/git/repo_language_stats_gogit.go b/modules/git/repo_language_stats_gogit.go
index 9674eca275..4c6fbd6c7e 100644
--- a/modules/git/repo_language_stats_gogit.go
+++ b/modules/git/repo_language_stats_gogit.go
@@ -156,7 +156,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
sizes[firstExcludedLanguage] = firstExcludedLanguageSize
}
- return sizes, nil
+ return mergeLanguageStats(sizes), nil
}
func readFile(f *object.File, limit int64) ([]byte, error) {
diff --git a/modules/git/repo_language_stats_nogogit.go b/modules/git/repo_language_stats_nogogit.go
index a1d28f40e8..1d94ad6c00 100644
--- a/modules/git/repo_language_stats_nogogit.go
+++ b/modules/git/repo_language_stats_nogogit.go
@@ -180,7 +180,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
// FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary?
// - e.g. do all the detection tests using filename first before reading content.
language := analyze.GetCodeLanguage(f.Name(), content)
- if language == enry.OtherLanguage || language == "" {
+ if language == "" {
continue
}
@@ -192,8 +192,8 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
included, checked := includedLanguage[language]
if !checked {
- langtype := enry.GetLanguageType(language)
- included = langtype == enry.Programming || langtype == enry.Markup
+ langType := enry.GetLanguageType(language)
+ included = langType == enry.Programming || langType == enry.Markup
includedLanguage[language] = included
}
if included {
@@ -210,7 +210,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
sizes[firstExcludedLanguage] = firstExcludedLanguageSize
}
- return sizes, nil
+ return mergeLanguageStats(sizes), nil
}
func discardFull(rd *bufio.Reader, discard int64) error {
diff --git a/modules/git/repo_language_stats_test.go b/modules/git/repo_language_stats_test.go
index d616d95741..da3871e909 100644
--- a/modules/git/repo_language_stats_test.go
+++ b/modules/git/repo_language_stats_test.go
@@ -30,3 +30,17 @@ func TestRepository_GetLanguageStats(t *testing.T) {
"Java": 112,
}, stats)
}
+
+func TestMergeLanguageStats(t *testing.T) {
+ assert.EqualValues(t, map[string]int64{
+ "PHP": 1,
+ "python": 10,
+ "JAVA": 700,
+ }, mergeLanguageStats(map[string]int64{
+ "PHP": 1,
+ "python": 10,
+ "Java": 100,
+ "java": 200,
+ "JAVA": 400,
+ }))
+}