Backport #24900. Fix #24896. If users set different languages via `linguist-language`, the `stats` map could contain names that differ only by case, e.g. `java: 100, Java: 200`. Language stats are stored case-insensitively in the database, where there is a unique key, so the differently-cased language names should be merged into one unique name: `Java: 300`. tags/v1.19.4
@@ -3,7 +3,46 @@ | |||
package git | |||
import ( | |||
"strings" | |||
"unicode" | |||
) | |||
// Size thresholds, in bytes, declared as typed int64 constants.
const (
	// fileSizeLimit is 16 KiB.
	fileSizeLimit int64 = 16 << 10
	// bigFileSize is 1 MiB.
	bigFileSize int64 = 1 << 20
)
// mergeLanguageStats merges language names that differ only by letter case
// into a single entry whose value is the sum of the merged values.
//
// For each group of names sharing the same lower-cased form, the name with the
// most upper-case letters is used as the key of the merged entry. When several
// names in a group have the same number of upper-case letters, the byte-wise
// smallest name wins, so the result does not depend on map iteration order.
//
// The input map is not modified; a new map is always returned (empty when
// stats is nil or empty).
func mergeLanguageStats(stats map[string]int64) map[string]int64 {
	type candidate struct {
		uniqueName string
		upperCount int
	}
	// names maps the lower-cased language name to the spelling chosen for it.
	names := make(map[string]candidate, len(stats))

	countUpper := func(s string) (count int) {
		for _, r := range s {
			if unicode.IsUpper(r) {
				count++
			}
		}
		return count
	}

	for name := range stats {
		cnt := countUpper(name)
		lower := strings.ToLower(name)
		cur, seen := names[lower]
		// Map iteration order is unspecified, so ties on the upper-case count
		// must be broken explicitly to keep the chosen spelling stable.
		if !seen || cnt > cur.upperCount || (cnt == cur.upperCount && name < cur.uniqueName) {
			names[lower] = candidate{uniqueName: name, upperCount: cnt}
		}
	}

	res := make(map[string]int64, len(names))
	for name, num := range stats {
		res[names[strings.ToLower(name)].uniqueName] += num
	}
	return res
}
@@ -156,7 +156,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err | |||
sizes[firstExcludedLanguage] = firstExcludedLanguageSize | |||
} | |||
return sizes, nil | |||
return mergeLanguageStats(sizes), nil | |||
} | |||
func readFile(f *object.File, limit int64) ([]byte, error) { |
@@ -180,7 +180,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err | |||
// FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary? | |||
// - eg. do the all the detection tests using filename first before reading content. | |||
language := analyze.GetCodeLanguage(f.Name(), content) | |||
if language == enry.OtherLanguage || language == "" { | |||
if language == "" { | |||
continue | |||
} | |||
@@ -192,8 +192,8 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err | |||
included, checked := includedLanguage[language] | |||
if !checked { | |||
langtype := enry.GetLanguageType(language) | |||
included = langtype == enry.Programming || langtype == enry.Markup | |||
langType := enry.GetLanguageType(language) | |||
included = langType == enry.Programming || langType == enry.Markup | |||
includedLanguage[language] = included | |||
} | |||
if included { | |||
@@ -210,7 +210,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err | |||
sizes[firstExcludedLanguage] = firstExcludedLanguageSize | |||
} | |||
return sizes, nil | |||
return mergeLanguageStats(sizes), nil | |||
} | |||
func discardFull(rd *bufio.Reader, discard int64) error { |
@@ -30,3 +30,17 @@ func TestRepository_GetLanguageStats(t *testing.T) { | |||
"Java": 112, | |||
}, stats) | |||
} | |||
func TestMergeLanguageStats(t *testing.T) { | |||
assert.EqualValues(t, map[string]int64{ | |||
"PHP": 1, | |||
"python": 10, | |||
"JAVA": 700, | |||
}, mergeLanguageStats(map[string]int64{ | |||
"PHP": 1, | |||
"python": 10, | |||
"Java": 100, | |||
"java": 200, | |||
"JAVA": 400, | |||
})) | |||
} |