This PR adds a task to the cron service to allow garbage collection of LFS meta objects. As repositories may have a large number of LFSMetaObjects, an `updated` column is added to this table and is used to perform a generational GC that attempts to reduce the amount of work. (There may need to be a bit more work here, but this is probably enough for the moment.) Fix #7045 Signed-off-by: Andrew Thornton <art27@cantab.net> tags/v1.19.0-rc0
@@ -2213,6 +2213,28 @@ ROUTER = console | |||
;SCHEDULE = @every 168h | |||
;OLDER_THAN = 8760h | |||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |||
;; Garbage collect LFS pointers in repositories | |||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |||
;[cron.gc_lfs] | |||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |||
;; Enable garbage collection of LFS pointers in repositories (default false) | |||
;ENABLED = false | |||
;; Run the task at start up time, if ENABLED (default false) | |||
;RUN_AT_START = false | |||
;; Interval as a duration between each gc run (default every 24h) | |||
;SCHEDULE = @every 24h | |||
;; Only attempt to garbage collect LFSMetaObjects older than this (default 7 days) | |||
;OLDER_THAN = 168h | |||
;; Only attempt to garbage collect LFSMetaObjects that have not been attempted to be garbage collected for this long (default 3 days) | |||
;LAST_UPDATED_MORE_THAN_AGO = 72h | |||
;; Minimum number of stale LFSMetaObjects to check per repo. Set to `0` to always check all. | |||
;NUMBER_TO_CHECK_PER_REPO = 100 | |||
;; Check at least this proportion of LFSMetaObjects per repo. (This may cause all stale LFSMetaObjects to be checked.) | |||
;PROPORTION_TO_CHECK_PER_REPO = 0.6 | |||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |||
;; Git Operation timeout in seconds |
@@ -1039,6 +1039,16 @@ Default templates for project boards: | |||
- `SCHEDULE`: **@every 168h**: Cron syntax to set how often to check. | |||
- `OLDER_THAN`: **8760h**: any system notice older than this expression will be deleted from database. | |||
#### Cron - Garbage collect LFS pointers in repositories (`cron.gc_lfs`) | |||
- `ENABLED`: **false**: Enable service. | |||
- `RUN_AT_START`: **false**: Run tasks at start up time (if ENABLED). | |||
- `SCHEDULE`: **@every 24h**: Cron syntax to set how often to check. | |||
- `OLDER_THAN`: **168h**: Only attempt to garbage collect LFSMetaObjects older than this (default 7 days) | |||
- `LAST_UPDATED_MORE_THAN_AGO`: **72h**: Only attempt to garbage collect LFSMetaObjects that have not been attempted to be garbage collected for this long (default 3 days) | |||
- `NUMBER_TO_CHECK_PER_REPO`: **100**: Minimum number of stale LFSMetaObjects to check per repo. Set to `0` to always check all. | |||
- `PROPORTION_TO_CHECK_PER_REPO`: **0.6**: Check at least this proportion of LFSMetaObjects per repo. (This may cause all stale LFSMetaObjects to be checked.) | |||
## Git (`git`) | |||
- `PATH`: **""**: The path of Git executable. If empty, Gitea searches through the PATH environment. |
@@ -115,6 +115,7 @@ type LFSMetaObject struct { | |||
RepositoryID int64 `xorm:"UNIQUE(s) INDEX NOT NULL"` | |||
Existing bool `xorm:"-"` | |||
CreatedUnix timeutil.TimeStamp `xorm:"created"` | |||
UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"` | |||
} | |||
func init() { | |||
@@ -334,8 +335,45 @@ func GetRepoLFSSize(ctx context.Context, repoID int64) (int64, error) { | |||
return lfsSize, nil | |||
} | |||
// IterateRepositoryIDsWithLFSMetaObjects iterates across the repositories that have LFSMetaObjects | |||
func IterateRepositoryIDsWithLFSMetaObjects(ctx context.Context, f func(ctx context.Context, repoID, count int64) error) error { | |||
batchSize := setting.Database.IterateBufferSize | |||
sess := db.GetEngine(ctx) | |||
id := int64(0) | |||
type RepositoryCount struct { | |||
RepositoryID int64 | |||
Count int64 | |||
} | |||
for { | |||
counts := make([]*RepositoryCount, 0, batchSize) | |||
sess.Select("repository_id, COUNT(id) AS count"). | |||
Table("lfs_meta_object"). | |||
Where("repository_id > ?", id). | |||
GroupBy("repository_id"). | |||
OrderBy("repository_id ASC") | |||
if err := sess.Limit(batchSize, 0).Find(&counts); err != nil { | |||
return err | |||
} | |||
if len(counts) == 0 { | |||
return nil | |||
} | |||
for _, count := range counts { | |||
if err := f(ctx, count.RepositoryID, count.Count); err != nil { | |||
return err | |||
} | |||
} | |||
id = counts[len(counts)-1].RepositoryID | |||
} | |||
} | |||
// IterateLFSMetaObjectsForRepoOptions provides options for IterateLFSMetaObjectsForRepo | |||
type IterateLFSMetaObjectsForRepoOptions struct { | |||
// OlderThan, when non-zero, restricts the iteration to objects whose created_unix is before this time. | |||
OlderThan time.Time | |||
OlderThan time.Time | |||
// UpdatedLessRecentlyThan, when non-zero, restricts the iteration to objects whose updated_unix is before this time. | |||
UpdatedLessRecentlyThan time.Time | |||
// OrderByUpdated orders results by updated_unix ascending; otherwise results are ordered by id and each batch starts after the last id seen. | |||
OrderByUpdated bool | |||
// LoopFunctionAlwaysUpdates: when false, the batch offset is advanced by the number of rows returned. | |||
// NOTE(review): presumably set when the callback always updates or removes each row so it drops out of the filter - confirm against callers. | |||
LoopFunctionAlwaysUpdates bool | |||
} | |||
// IterateLFSMetaObjectsForRepo provides a iterator for LFSMetaObjects per Repo | |||
@@ -348,28 +386,53 @@ func IterateLFSMetaObjectsForRepo(ctx context.Context, repoID int64, f func(cont | |||
LFSMetaObject | |||
} | |||
id := int64(0) | |||
for { | |||
beans := make([]*CountLFSMetaObject, 0, batchSize) | |||
// SELECT `lfs_meta_object`.*, COUNT(`l1`.id) as `count` FROM lfs_meta_object INNER JOIN lfs_meta_object AS l1 ON l1.oid = lfs_meta_object.oid WHERE lfs_meta_object.repository_id = ? GROUP BY lfs_meta_object.id | |||
sess := engine.Select("`lfs_meta_object`.*, COUNT(`l1`.oid) AS `count`"). | |||
Join("INNER", "`lfs_meta_object` AS l1", "`lfs_meta_object`.oid = `l1`.oid"). | |||
Where("`lfs_meta_object`.repository_id = ?", repoID) | |||
if !opts.OlderThan.IsZero() { | |||
sess.And("`lfs_meta_object`.created_unix < ?", opts.OlderThan) | |||
} | |||
if !opts.UpdatedLessRecentlyThan.IsZero() { | |||
sess.And("`lfs_meta_object`.updated_unix < ?", opts.UpdatedLessRecentlyThan) | |||
} | |||
sess.GroupBy("`lfs_meta_object`.id") | |||
if opts.OrderByUpdated { | |||
sess.OrderBy("`lfs_meta_object`.updated_unix ASC") | |||
} else { | |||
sess.And("`lfs_meta_object`.id > ?", id) | |||
sess.OrderBy("`lfs_meta_object`.id ASC") | |||
} | |||
if err := sess.Limit(batchSize, start).Find(&beans); err != nil { | |||
return err | |||
} | |||
if len(beans) == 0 { | |||
return nil | |||
} | |||
start += len(beans) | |||
if !opts.LoopFunctionAlwaysUpdates { | |||
start += len(beans) | |||
} | |||
for _, bean := range beans { | |||
if err := f(ctx, &bean.LFSMetaObject, bean.Count); err != nil { | |||
return err | |||
} | |||
} | |||
id = beans[len(beans)-1].ID | |||
} | |||
} | |||
// MarkLFSMetaObject updates the updated time for the provided LFSMetaObject | |||
func MarkLFSMetaObject(ctx context.Context, id int64) error { | |||
obj := &LFSMetaObject{ | |||
UpdatedUnix: timeutil.TimeStampNow(), | |||
} | |||
count, err := db.GetEngine(ctx).ID(id).Update(obj) | |||
if count != 1 { | |||
log.Error("Unexpectedly updated %d LFSMetaObjects with ID: %d", count, id) | |||
} | |||
return err | |||
} |
@@ -432,6 +432,9 @@ var migrations = []Migration{ | |||
NewMigration("Update counts of all open milestones", v1_18.UpdateOpenMilestoneCounts), | |||
// v230 -> v231 | |||
NewMigration("Add ConfidentialClient column (default true) to OAuth2Application table", v1_18.AddConfidentialClientColumnToOAuth2ApplicationTable), | |||
// Gitea 1.18.0 ends at v231 | |||
// v231 -> v232 | |||
NewMigration("Add index for hook_task", v1_19.AddIndexForHookTask), | |||
// v232 -> v233 | |||
@@ -446,6 +449,8 @@ var migrations = []Migration{ | |||
NewMigration("Create secrets table", v1_19.CreateSecretsTable), | |||
// v237 -> v238 | |||
NewMigration("Drop ForeignReference table", v1_19.DropForeignReferenceTable), | |||
// v238 -> v239 | |||
NewMigration("Add updated unix to LFSMetaObject", v1_19.AddUpdatedUnixToLFSMetaObject), | |||
} | |||
// GetCurrentDBVersion returns the current db version |
@@ -0,0 +1,27 @@ | |||
// Copyright 2022 The Gitea Authors. All rights reserved. | |||
// SPDX-License-Identifier: MIT | |||
package v1_19 //nolint | |||
import ( | |||
"code.gitea.io/gitea/modules/timeutil" | |||
"xorm.io/xorm" | |||
) | |||
// AddUpdatedUnixToLFSMetaObject adds an updated column to the LFSMetaObject to allow for garbage collection | |||
func AddUpdatedUnixToLFSMetaObject(x *xorm.Engine) error { | |||
// A migration-local copy of the struct is synced so that the table gains the | |||
// indexed UpdatedUnix column declared below. | |||
// LFSMetaObject stores metadata for LFS tracked files. | |||
type LFSMetaObject struct { | |||
ID int64 `xorm:"pk autoincr"` | |||
Oid string `json:"oid" xorm:"UNIQUE(s) INDEX NOT NULL"` | |||
Size int64 `json:"size" xorm:"NOT NULL"` | |||
RepositoryID int64 `xorm:"UNIQUE(s) INDEX NOT NULL"` | |||
CreatedUnix timeutil.TimeStamp `xorm:"created"` | |||
UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"` | |||
} | |||
return x.Sync(new(LFSMetaObject)) | |||
} |
@@ -6,6 +6,7 @@ package doctor | |||
import ( | |||
"context" | |||
"fmt" | |||
"time" | |||
"code.gitea.io/gitea/modules/log" | |||
"code.gitea.io/gitea/modules/setting" | |||
@@ -29,7 +30,20 @@ func garbageCollectLFSCheck(ctx context.Context, logger log.Logger, autofix bool | |||
return fmt.Errorf("LFS support is disabled") | |||
} | |||
if err := repository.GarbageCollectLFSMetaObjects(ctx, logger, autofix); err != nil { | |||
if err := repository.GarbageCollectLFSMetaObjects(ctx, repository.GarbageCollectLFSMetaObjectsOptions{ | |||
Logger: logger, | |||
AutoFix: autofix, | |||
// Only attempt to garbage collect lfs meta objects older than a week as the order of git lfs upload | |||
// and git object upload is not necessarily guaranteed. It's possible to imagine a situation whereby | |||
// an LFS object is uploaded but the git branch is not uploaded immediately, or there are some rapid | |||
// changes in new branches that might lead to lfs objects becoming temporarily unassociated with git | |||
// objects. | |||
// | |||
// It is likely that a week is potentially excessive but it should definitely be enough that any | |||
// unassociated LFS object is genuinely unassociated. | |||
OlderThan: time.Now().Add(-24 * time.Hour * 7), | |||
// We don't set the UpdatedLessRecentlyThan because we want to do a full GC | |||
}); err != nil { | |||
return err | |||
} | |||
@@ -2554,6 +2554,7 @@ dashboard.delete_old_actions = Delete all old actions from database | |||
dashboard.delete_old_actions.started = Delete all old actions from database started. | |||
dashboard.update_checker = Update checker | |||
dashboard.delete_old_system_notices = Delete all old system notices from database | |||
dashboard.gc_lfs = Garbage collect LFS meta objects | |||
users.user_manage_panel = User Account Management | |||
users.new_account = Create User Account |
@@ -175,6 +175,48 @@ func registerDeleteOldSystemNotices() { | |||
}) | |||
} | |||
func registerGCLFS() { | |||
if !setting.LFS.StartServer { | |||
return | |||
} | |||
type GCLFSConfig struct { | |||
OlderThanConfig | |||
LastUpdatedMoreThanAgo time.Duration | |||
NumberToCheckPerRepo int64 | |||
ProportionToCheckPerRepo float64 | |||
} | |||
RegisterTaskFatal("gc_lfs", &GCLFSConfig{ | |||
OlderThanConfig: OlderThanConfig{ | |||
BaseConfig: BaseConfig{ | |||
Enabled: false, | |||
RunAtStart: false, | |||
Schedule: "@every 24h", | |||
}, | |||
// Only attempt to garbage collect lfs meta objects older than a week as the order of git lfs upload | |||
// and git object upload is not necessarily guaranteed. It's possible to imagine a situation whereby | |||
// an LFS object is uploaded but the git branch is not uploaded immediately, or there are some rapid | |||
// changes in new branches that might lead to lfs objects becoming temporarily unassociated with git | |||
// objects. | |||
// | |||
// It is likely that a week is potentially excessive but it should definitely be enough that any | |||
// unassociated LFS object is genuinely unassociated. | |||
OlderThan: 24 * time.Hour * 7, | |||
}, | |||
// Only GC things that haven't been looked at in the past 3 days | |||
LastUpdatedMoreThanAgo: 24 * time.Hour * 3, | |||
NumberToCheckPerRepo: 100, | |||
ProportionToCheckPerRepo: 0.6, | |||
}, func(ctx context.Context, _ *user_model.User, config Config) error { | |||
gcLFSConfig := config.(*GCLFSConfig) | |||
return repo_service.GarbageCollectLFSMetaObjects(ctx, repo_service.GarbageCollectLFSMetaObjectsOptions{ | |||
AutoFix: true, | |||
OlderThan: time.Now().Add(-gcLFSConfig.OlderThan), | |||
UpdatedLessRecentlyThan: time.Now().Add(-gcLFSConfig.LastUpdatedMoreThanAgo), | |||
}) | |||
}) | |||
} | |||
func initExtendedTasks() { | |||
registerDeleteInactiveUsers() | |||
registerDeleteRepositoryArchives() | |||
@@ -188,4 +230,5 @@ func initExtendedTasks() { | |||
registerDeleteOldActions() | |||
registerUpdateGiteaChecker() | |||
registerDeleteOldSystemNotices() | |||
registerGCLFS() | |||
} |
@@ -5,49 +5,67 @@ package repository | |||
import ( | |||
"context" | |||
"errors" | |||
"fmt" | |||
"time" | |||
"code.gitea.io/gitea/models/db" | |||
git_model "code.gitea.io/gitea/models/git" | |||
repo_model "code.gitea.io/gitea/models/repo" | |||
"code.gitea.io/gitea/modules/git" | |||
"code.gitea.io/gitea/modules/lfs" | |||
"code.gitea.io/gitea/modules/log" | |||
"xorm.io/builder" | |||
"code.gitea.io/gitea/modules/setting" | |||
) | |||
func GarbageCollectLFSMetaObjects(ctx context.Context, logger log.Logger, autofix bool) error { | |||
// GarbageCollectLFSMetaObjectsOptions provides options for GarbageCollectLFSMetaObjects function | |||
type GarbageCollectLFSMetaObjectsOptions struct { | |||
// Logger receives progress messages; it may be nil, in which case nothing is reported through it. | |||
Logger log.Logger | |||
// AutoFix: when false, orphaned LFSMetaObjects are only counted and no fix is attempted. | |||
AutoFix bool | |||
// OlderThan is passed on as the OlderThan option of the per-repo LFSMetaObject iteration. | |||
OlderThan time.Time | |||
// UpdatedLessRecentlyThan is passed on as the UpdatedLessRecentlyThan option of the per-repo LFSMetaObject iteration. | |||
UpdatedLessRecentlyThan time.Time | |||
// NumberToCheckPerRepo, when greater than 0, stops processing of a repo once more than this many objects have been examined. | |||
NumberToCheckPerRepo int64 | |||
// ProportionToCheckPerRepo is multiplied by a repo's LFSMetaObject count; a larger result raises NumberToCheckPerRepo (unless that is 0). | |||
ProportionToCheckPerRepo float64 | |||
} | |||
// GarbageCollectLFSMetaObjects garbage collects LFS objects for all repositories | |||
func GarbageCollectLFSMetaObjects(ctx context.Context, opts GarbageCollectLFSMetaObjectsOptions) error { | |||
log.Trace("Doing: GarbageCollectLFSMetaObjects") | |||
defer log.Trace("Finished: GarbageCollectLFSMetaObjects") | |||
if err := db.Iterate( | |||
ctx, | |||
builder.And(builder.Gt{"id": 0}), | |||
func(ctx context.Context, repo *repo_model.Repository) error { | |||
return GarbageCollectLFSMetaObjectsForRepo(ctx, repo, logger, autofix) | |||
}, | |||
); err != nil { | |||
return err | |||
if !setting.LFS.StartServer { | |||
if opts.Logger != nil { | |||
opts.Logger.Info("LFS support is disabled") | |||
} | |||
return nil | |||
} | |||
log.Trace("Finished: GarbageCollectLFSMetaObjects") | |||
return nil | |||
return git_model.IterateRepositoryIDsWithLFSMetaObjects(ctx, func(ctx context.Context, repoID, count int64) error { | |||
repo, err := repo_model.GetRepositoryByID(ctx, repoID) | |||
if err != nil { | |||
return err | |||
} | |||
if newMinimum := int64(float64(count) * opts.ProportionToCheckPerRepo); newMinimum > opts.NumberToCheckPerRepo && opts.NumberToCheckPerRepo != 0 { | |||
opts.NumberToCheckPerRepo = newMinimum | |||
} | |||
return GarbageCollectLFSMetaObjectsForRepo(ctx, repo, opts) | |||
}) | |||
} | |||
func GarbageCollectLFSMetaObjectsForRepo(ctx context.Context, repo *repo_model.Repository, logger log.Logger, autofix bool) error { | |||
if logger != nil { | |||
logger.Info("Checking %-v", repo) | |||
// GarbageCollectLFSMetaObjectsForRepo garbage collects LFS objects for a specific repository | |||
func GarbageCollectLFSMetaObjectsForRepo(ctx context.Context, repo *repo_model.Repository, opts GarbageCollectLFSMetaObjectsOptions) error { | |||
if opts.Logger != nil { | |||
opts.Logger.Info("Checking %-v", repo) | |||
} | |||
total, orphaned, collected, deleted := 0, 0, 0, 0 | |||
if logger != nil { | |||
total, orphaned, collected, deleted := int64(0), 0, 0, 0 | |||
if opts.Logger != nil { | |||
defer func() { | |||
if orphaned == 0 { | |||
logger.Info("Found %d total LFSMetaObjects in %-v", total, repo) | |||
} else if !autofix { | |||
logger.Info("Found %d/%d orphaned LFSMetaObjects in %-v", orphaned, total, repo) | |||
opts.Logger.Info("Found %d total LFSMetaObjects in %-v", total, repo) | |||
} else if !opts.AutoFix { | |||
opts.Logger.Info("Found %d/%d orphaned LFSMetaObjects in %-v", orphaned, total, repo) | |||
} else { | |||
logger.Info("Collected %d/%d orphaned/%d total LFSMetaObjects in %-v. %d removed from storage.", collected, orphaned, total, repo, deleted) | |||
opts.Logger.Info("Collected %d/%d orphaned/%d total LFSMetaObjects in %-v. %d removed from storage.", collected, orphaned, total, repo, deleted) | |||
} | |||
}() | |||
} | |||
@@ -60,17 +78,21 @@ func GarbageCollectLFSMetaObjectsForRepo(ctx context.Context, repo *repo_model.R | |||
defer gitRepo.Close() | |||
store := lfs.NewContentStore() | |||
errStop := errors.New("STOPERR") | |||
return git_model.IterateLFSMetaObjectsForRepo(ctx, repo.ID, func(ctx context.Context, metaObject *git_model.LFSMetaObject, count int64) error { | |||
err = git_model.IterateLFSMetaObjectsForRepo(ctx, repo.ID, func(ctx context.Context, metaObject *git_model.LFSMetaObject, count int64) error { | |||
if opts.NumberToCheckPerRepo > 0 && total > opts.NumberToCheckPerRepo { | |||
return errStop | |||
} | |||
total++ | |||
pointerSha := git.ComputeBlobHash([]byte(metaObject.Pointer.StringContent())) | |||
if gitRepo.IsObjectExist(pointerSha.String()) { | |||
return nil | |||
return git_model.MarkLFSMetaObject(ctx, metaObject.ID) | |||
} | |||
orphaned++ | |||
if !autofix { | |||
if !opts.AutoFix { | |||
return nil | |||
} | |||
// Non-existent pointer file | |||
@@ -100,6 +122,19 @@ func GarbageCollectLFSMetaObjectsForRepo(ctx context.Context, repo *repo_model.R | |||
// | |||
// It is likely that a week is potentially excessive but it should definitely be enough that any | |||
// unassociated LFS object is genuinely unassociated. | |||
OlderThan: time.Now().Add(-24 * 7 * time.Hour), | |||
OlderThan: opts.OlderThan, | |||
UpdatedLessRecentlyThan: opts.UpdatedLessRecentlyThan, | |||
OrderByUpdated: true, | |||
LoopFunctionAlwaysUpdates: true, | |||
}) | |||
if err == errStop { | |||
if opts.Logger != nil { | |||
opts.Logger.Info("Processing stopped at %d total LFSMetaObjects in %-v", total, repo) | |||
} | |||
return nil | |||
} else if err != nil { | |||
return err | |||
} | |||
return nil | |||
} |