* Change language statistics to save size instead of percentage (#11681) * Change language statistics to save size instead of percentage in database Co-Authored-By: Cirno the Strongest <1447794+CirnoT@users.noreply.github.com> * Do not exclude if only language * Fix edge cases with special languages Co-authored-by: Cirno the Strongest <1447794+CirnoT@users.noreply.github.com> * Fix language stat calculation (#11692) * Fix language stat calculation * Group languages and ignore 0 size files * remove unneeded code Co-authored-by: Cirno the Strongest <1447794+CirnoT@users.noreply.github.com> tags/v1.12.0-rc2
NewMigration("Add ResolveDoerID to Comment table", addResolveDoerIDCommentColumn), | NewMigration("Add ResolveDoerID to Comment table", addResolveDoerIDCommentColumn), | ||||
// v139 -> v140 | // v139 -> v140 | ||||
NewMigration("prepend refs/heads/ to issue refs", prependRefsHeadsToIssueRefs), | NewMigration("prepend refs/heads/ to issue refs", prependRefsHeadsToIssueRefs), | ||||
// v140 -> v141 | |||||
NewMigration("Save detected language file size to database instead of percent", fixLanguageStatsToSaveSize), | |||||
} | } | ||||
// GetCurrentDBVersion returns the current db version | // GetCurrentDBVersion returns the current db version |
// Copyright 2020 The Gitea Authors. All rights reserved. | |||||
// Use of this source code is governed by a MIT-style | |||||
// license that can be found in the LICENSE file. | |||||
package migrations | |||||
import ( | |||||
"fmt" | |||||
"code.gitea.io/gitea/modules/setting" | |||||
"xorm.io/xorm" | |||||
) | |||||
func fixLanguageStatsToSaveSize(x *xorm.Engine) error { | |||||
// LanguageStat see models/repo_language_stats.go | |||||
type LanguageStat struct { | |||||
Size int64 `xorm:"NOT NULL DEFAULT 0"` | |||||
} | |||||
// RepoIndexerType specifies the repository indexer type | |||||
type RepoIndexerType int | |||||
const ( | |||||
// RepoIndexerTypeCode code indexer | |||||
RepoIndexerTypeCode RepoIndexerType = iota // 0 | |||||
// RepoIndexerTypeStats repository stats indexer | |||||
RepoIndexerTypeStats // 1 | |||||
) | |||||
// RepoIndexerStatus see models/repo_indexer.go | |||||
type RepoIndexerStatus struct { | |||||
IndexerType RepoIndexerType `xorm:"INDEX(s) NOT NULL DEFAULT 0"` | |||||
} | |||||
if err := x.Sync2(new(LanguageStat)); err != nil { | |||||
return fmt.Errorf("Sync2: %v", err) | |||||
} | |||||
x.Delete(&RepoIndexerStatus{IndexerType: RepoIndexerTypeStats}) | |||||
// Delete language stat statuses | |||||
truncExpr := "TRUNCATE TABLE" | |||||
if setting.Database.UseSQLite3 { | |||||
truncExpr = "DELETE FROM" | |||||
} | |||||
// Delete language stats | |||||
if _, err := x.Exec(fmt.Sprintf("%s language_stat", truncExpr)); err != nil { | |||||
return err | |||||
} | |||||
sess := x.NewSession() | |||||
defer sess.Close() | |||||
return dropTableColumns(sess, "language_stat", "percentage") | |||||
} |
CommitID string | CommitID string | ||||
IsPrimary bool | IsPrimary bool | ||||
Language string `xorm:"VARCHAR(30) UNIQUE(s) INDEX NOT NULL"` | Language string `xorm:"VARCHAR(30) UNIQUE(s) INDEX NOT NULL"` | ||||
Percentage float32 `xorm:"NUMERIC(5,2) NOT NULL DEFAULT 0"` | |||||
Percentage float32 `xorm:"-"` | |||||
Size int64 `xorm:"NOT NULL DEFAULT 0"` | |||||
Color string `xorm:"-"` | Color string `xorm:"-"` | ||||
CreatedUnix timeutil.TimeStamp `xorm:"INDEX CREATED"` | CreatedUnix timeutil.TimeStamp `xorm:"INDEX CREATED"` | ||||
} | } | ||||
} | } | ||||
} | } | ||||
func (stats LanguageStatList) getLanguagePercentages() map[string]float32 { | |||||
langPerc := make(map[string]float32) | |||||
var otherPerc float32 = 100 | |||||
var total int64 | |||||
for _, stat := range stats { | |||||
total += stat.Size | |||||
} | |||||
if total > 0 { | |||||
for _, stat := range stats { | |||||
perc := float32(math.Round(float64(stat.Size)/float64(total)*1000) / 10) | |||||
if perc <= 0.1 { | |||||
continue | |||||
} | |||||
otherPerc -= perc | |||||
langPerc[stat.Language] = perc | |||||
} | |||||
otherPerc = float32(math.Round(float64(otherPerc)*10) / 10) | |||||
} | |||||
if otherPerc > 0 { | |||||
langPerc["other"] = otherPerc | |||||
} | |||||
return langPerc | |||||
} | |||||
func (repo *Repository) getLanguageStats(e Engine) (LanguageStatList, error) { | func (repo *Repository) getLanguageStats(e Engine) (LanguageStatList, error) { | ||||
stats := make(LanguageStatList, 0, 6) | stats := make(LanguageStatList, 0, 6) | ||||
if err := e.Where("`repo_id` = ?", repo.ID).Desc("`percentage`").Find(&stats); err != nil { | |||||
if err := e.Where("`repo_id` = ?", repo.ID).Desc("`size`").Find(&stats); err != nil { | |||||
return nil, err | return nil, err | ||||
} | } | ||||
stats.loadAttributes() | |||||
return stats, nil | return stats, nil | ||||
} | } | ||||
if err != nil { | if err != nil { | ||||
return nil, err | return nil, err | ||||
} | } | ||||
perc := stats.getLanguagePercentages() | |||||
topstats := make(LanguageStatList, 0, limit) | topstats := make(LanguageStatList, 0, limit) | ||||
var other float32 | var other float32 | ||||
for i := range stats { | for i := range stats { | ||||
if _, ok := perc[stats[i].Language]; !ok { | |||||
continue | |||||
} | |||||
if stats[i].Language == "other" || len(topstats) >= limit { | if stats[i].Language == "other" || len(topstats) >= limit { | ||||
other += stats[i].Percentage | |||||
other += perc[stats[i].Language] | |||||
continue | continue | ||||
} | } | ||||
stats[i].Percentage = perc[stats[i].Language] | |||||
topstats = append(topstats, stats[i]) | topstats = append(topstats, stats[i]) | ||||
} | } | ||||
if other > 0 { | if other > 0 { | ||||
Percentage: float32(math.Round(float64(other)*10) / 10), | Percentage: float32(math.Round(float64(other)*10) / 10), | ||||
}) | }) | ||||
} | } | ||||
topstats.loadAttributes() | |||||
return topstats, nil | return topstats, nil | ||||
} | } | ||||
// UpdateLanguageStats updates the language statistics for repository | // UpdateLanguageStats updates the language statistics for repository | ||||
func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]float32) error { | |||||
func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]int64) error { | |||||
sess := x.NewSession() | sess := x.NewSession() | ||||
if err := sess.Begin(); err != nil { | if err := sess.Begin(); err != nil { | ||||
return err | return err | ||||
return err | return err | ||||
} | } | ||||
var topLang string | var topLang string | ||||
var p float32 | |||||
for lang, perc := range stats { | |||||
if perc > p { | |||||
p = perc | |||||
var s int64 | |||||
for lang, size := range stats { | |||||
if size > s { | |||||
s = size | |||||
topLang = strings.ToLower(lang) | topLang = strings.ToLower(lang) | ||||
} | } | ||||
} | } | ||||
for lang, perc := range stats { | |||||
for lang, size := range stats { | |||||
upd := false | upd := false | ||||
llang := strings.ToLower(lang) | llang := strings.ToLower(lang) | ||||
for _, s := range oldstats { | for _, s := range oldstats { | ||||
if strings.ToLower(s.Language) == llang { | if strings.ToLower(s.Language) == llang { | ||||
s.CommitID = commitID | s.CommitID = commitID | ||||
s.IsPrimary = llang == topLang | s.IsPrimary = llang == topLang | ||||
s.Percentage = perc | |||||
if _, err := sess.ID(s.ID).Cols("`commit_id`", "`percentage`", "`is_primary`").Update(s); err != nil { | |||||
s.Size = size | |||||
if _, err := sess.ID(s.ID).Cols("`commit_id`", "`size`", "`is_primary`").Update(s); err != nil { | |||||
return err | return err | ||||
} | } | ||||
upd = true | upd = true | ||||
// Insert new language | // Insert new language | ||||
if !upd { | if !upd { | ||||
if _, err := sess.Insert(&LanguageStat{ | if _, err := sess.Insert(&LanguageStat{ | ||||
RepoID: repo.ID, | |||||
CommitID: commitID, | |||||
IsPrimary: llang == topLang, | |||||
Language: lang, | |||||
Percentage: perc, | |||||
RepoID: repo.ID, | |||||
CommitID: commitID, | |||||
IsPrimary: llang == topLang, | |||||
Language: lang, | |||||
Size: size, | |||||
}); err != nil { | }); err != nil { | ||||
return err | return err | ||||
} | } | ||||
return err | return err | ||||
} | } | ||||
RepoLang := make(LanguageStatList, 0, 6) | RepoLang := make(LanguageStatList, 0, 6) | ||||
if err := sess.Where("`repo_id` = ?", originalRepo.ID).Desc("`percentage`").Find(&RepoLang); err != nil { | |||||
if err := sess.Where("`repo_id` = ?", originalRepo.ID).Desc("`size`").Find(&RepoLang); err != nil { | |||||
return err | return err | ||||
} | } | ||||
if len(RepoLang) > 0 { | if len(RepoLang) > 0 { |
"bytes" | "bytes" | ||||
"io" | "io" | ||||
"io/ioutil" | "io/ioutil" | ||||
"math" | |||||
"code.gitea.io/gitea/modules/analyze" | "code.gitea.io/gitea/modules/analyze" | ||||
const fileSizeLimit int64 = 16 * 1024 * 1024 | const fileSizeLimit int64 = 16 * 1024 * 1024 | ||||
// specialLanguages defines list of languages that are excluded from the calculation | |||||
// unless they are the only language present in repository. Only languages which under | |||||
// normal circumstances are not considered to be code should be listed here. | |||||
var specialLanguages = []string{ | |||||
"XML", | |||||
"JSON", | |||||
"TOML", | |||||
"YAML", | |||||
"INI", | |||||
"SVG", | |||||
"Text", | |||||
"Markdown", | |||||
} | |||||
// GetLanguageStats calculates language stats for git repository at specified commit | // GetLanguageStats calculates language stats for git repository at specified commit | ||||
func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, error) { | |||||
func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, error) { | |||||
r, err := git.PlainOpen(repo.Path) | r, err := git.PlainOpen(repo.Path) | ||||
if err != nil { | if err != nil { | ||||
return nil, err | return nil, err | ||||
} | } | ||||
sizes := make(map[string]int64) | sizes := make(map[string]int64) | ||||
var total int64 | |||||
err = tree.Files().ForEach(func(f *object.File) error { | err = tree.Files().ForEach(func(f *object.File) error { | ||||
if enry.IsVendor(f.Name) || enry.IsDotFile(f.Name) || | |||||
if f.Size == 0 || enry.IsVendor(f.Name) || enry.IsDotFile(f.Name) || | |||||
enry.IsDocumentation(f.Name) || enry.IsConfiguration(f.Name) { | enry.IsDocumentation(f.Name) || enry.IsConfiguration(f.Name) { | ||||
return nil | return nil | ||||
} | } | ||||
return nil | return nil | ||||
} | } | ||||
// group languages, such as Pug -> HTML; SCSS -> CSS | |||||
group := enry.GetLanguageGroup(language) | |||||
if group != "" { | |||||
language = group | |||||
} | |||||
sizes[language] += f.Size | sizes[language] += f.Size | ||||
total += f.Size | |||||
return nil | return nil | ||||
}) | }) | ||||
return nil, err | return nil, err | ||||
} | } | ||||
stats := make(map[string]float32) | |||||
var otherPerc float32 = 100 | |||||
for language, size := range sizes { | |||||
perc := float32(math.Round(float64(size)/float64(total)*1000) / 10) | |||||
if perc <= 0.1 { | |||||
continue | |||||
// filter special languages unless they are the only language | |||||
if len(sizes) > 1 { | |||||
for _, language := range specialLanguages { | |||||
delete(sizes, language) | |||||
} | } | ||||
otherPerc -= perc | |||||
stats[language] = perc | |||||
} | |||||
otherPerc = float32(math.Round(float64(otherPerc)*10) / 10) | |||||
if otherPerc > 0 { | |||||
stats["other"] = otherPerc | |||||
} | } | ||||
return stats, nil | |||||
return sizes, nil | |||||
} | } | ||||
func readFile(f *object.File, limit int64) ([]byte, error) { | func readFile(f *object.File, limit int64) ([]byte, error) { |
repo, err := models.GetRepositoryByID(1) | repo, err := models.GetRepositoryByID(1) | ||||
assert.NoError(t, err) | assert.NoError(t, err) | ||||
status, err := repo.GetIndexerStatus(models.RepoIndexerTypeStats) | |||||
assert.NoError(t, err) | |||||
assert.Equal(t, "65f1bf27bc3bf70f64657658635e66094edbcb4d", status.CommitSha) | |||||
langs, err := repo.GetTopLanguageStats(5) | langs, err := repo.GetTopLanguageStats(5) | ||||
assert.NoError(t, err) | assert.NoError(t, err) | ||||
assert.Len(t, langs, 1) | |||||
assert.Equal(t, "other", langs[0].Language) | |||||
assert.Equal(t, float32(100), langs[0].Percentage) | |||||
assert.Empty(t, langs) | |||||
} | } |