Browse Source

Add cron method to gc LFS MetaObjects (#22385)

This PR adds a task to the cron service to allow garbage collection of
LFS meta objects. As repositories may have a large number of
LFSMetaObjects, an updated column is added to this table and it is used
to perform a generational GC to attempt to reduce the amount of work.
(There may need to be a bit more work here but this is probably enough
for the moment.)

Fix #7045

Signed-off-by: Andrew Thornton <art27@cantab.net>
tags/v1.19.0-rc0
zeripath 1 year ago
parent
commit
2cc3a6381c
No account linked to committer's email address

+ 22
- 0
custom/conf/app.example.ini View File

;SCHEDULE = @every 168h ;SCHEDULE = @every 168h
;OLDER_THAN = 8760h ;OLDER_THAN = 8760h


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Garbage collect LFS pointers in repositories
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;[cron.gc_lfs]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;ENABLED = false
;; Run the garbage collection task at start up time, if ENABLED (default false)
;RUN_AT_START = false
;; Interval as a duration between each gc run (default every 24h)
;SCHEDULE = @every 24h
;; Only attempt to garbage collect LFSMetaObjects older than this (default 7 days)
;OLDER_THAN = 168h
;; Only attempt to garbage collect LFSMetaObjects that have not been attempted to be garbage collected for this long (default 3 days)
;LAST_UPDATED_MORE_THAN_AGO = 72h
;; Minimum number of stale LFSMetaObjects to check per repo. Set to `0` to always check all.
;NUMBER_TO_CHECK_PER_REPO = 100
;; Check at least this proportion of LFSMetaObjects per repo. (This may cause all stale LFSMetaObjects to be checked.)
;PROPORTION_TO_CHECK_PER_REPO = 0.6


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Git Operation timeout in seconds ;; Git Operation timeout in seconds

+ 10
- 0
docs/content/doc/advanced/config-cheat-sheet.en-us.md View File

- `SCHEDULE`: **@every 168h**: Cron syntax to set how often to check. - `SCHEDULE`: **@every 168h**: Cron syntax to set how often to check.
- `OLDER_THAN`: **@every 8760h**: any system notice older than this expression will be deleted from database. - `OLDER_THAN`: **@every 8760h**: any system notice older than this expression will be deleted from database.


#### Cron - Garbage collect LFS pointers in repositories (`cron.gc_lfs`)

- `ENABLED`: **false**: Enable service.
- `RUN_AT_START`: **false**: Run tasks at start up time (if ENABLED).
- `SCHEDULE`: **@every 24h**: Cron syntax to set how often to check.
- `OLDER_THAN`: **168h**: Only attempt to garbage collect LFSMetaObjects older than this (default 7 days)
- `LAST_UPDATED_MORE_THAN_AGO`: **72h**: Only attempt to garbage collect LFSMetaObjects that have not been attempted to be garbage collected for this long (default 3 days)
- `NUMBER_TO_CHECK_PER_REPO`: **100**: Minimum number of stale LFSMetaObjects to check per repo. Set to `0` to always check all.
- `PROPORTION_TO_CHECK_PER_REPO`: **0.6**: Check at least this proportion of LFSMetaObjects per repo. (This may cause all stale LFSMetaObjects to be checked.)

## Git (`git`) ## Git (`git`)


- `PATH`: **""**: The path of Git executable. If empty, Gitea searches through the PATH environment. - `PATH`: **""**: The path of Git executable. If empty, Gitea searches through the PATH environment.

+ 66
- 3
models/git/lfs.go View File

RepositoryID int64 `xorm:"UNIQUE(s) INDEX NOT NULL"` RepositoryID int64 `xorm:"UNIQUE(s) INDEX NOT NULL"`
Existing bool `xorm:"-"` Existing bool `xorm:"-"`
CreatedUnix timeutil.TimeStamp `xorm:"created"` CreatedUnix timeutil.TimeStamp `xorm:"created"`
UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"`
} }


func init() { func init() {
return lfsSize, nil return lfsSize, nil
} }


// IterateRepositoryIDsWithLFSMetaObjects iterates across the repositories that have LFSMetaObjects.
//
// For every distinct repository_id found in the lfs_meta_object table, f is
// called with that repository ID and the number of LFSMetaObject rows that
// belong to it. Repositories are visited in ascending repository_id order, in
// batches of setting.Database.IterateBufferSize. Iteration stops at the first
// error returned by either the query or f, and that error is returned as-is.
func IterateRepositoryIDsWithLFSMetaObjects(ctx context.Context, f func(ctx context.Context, repoID, count int64) error) error {
	batchSize := setting.Database.IterateBufferSize
	// lastID is the cursor for the next batch: only repositories with a
	// strictly greater ID are selected, so the offset can always stay at 0.
	lastID := int64(0)
	type RepositoryCount struct {
		RepositoryID int64
		Count        int64
	}
	for {
		counts := make([]*RepositoryCount, 0, batchSize)
		// Build and execute the query as one chain on a freshly obtained
		// engine, so that the statement being executed is exactly the one
		// that was just built and no state is carried over between batches.
		if err := db.GetEngine(ctx).
			Select("repository_id, COUNT(id) AS count").
			Table("lfs_meta_object").
			Where("repository_id > ?", lastID).
			GroupBy("repository_id").
			OrderBy("repository_id ASC").
			Limit(batchSize, 0).
			Find(&counts); err != nil {
			return err
		}
		if len(counts) == 0 {
			return nil
		}

		for _, count := range counts {
			if err := f(ctx, count.RepositoryID, count.Count); err != nil {
				return err
			}
		}
		lastID = counts[len(counts)-1].RepositoryID
	}
}

// IterateLFSMetaObjectsForRepoOptions provides options for IterateLFSMetaObjectsForRepo
type IterateLFSMetaObjectsForRepoOptions struct {
	// OlderThan, when non-zero, restricts iteration to objects whose
	// created_unix is before this time.
	OlderThan time.Time
	// UpdatedLessRecentlyThan, when non-zero, restricts iteration to objects
	// whose updated_unix is before this time.
	UpdatedLessRecentlyThan time.Time
	// OrderByUpdated orders results by updated_unix ascending; otherwise
	// results are ordered by id ascending, starting after the last id seen.
	OrderByUpdated bool
	// LoopFunctionAlwaysUpdates tells the iterator not to advance its row
	// offset between batches.
	// NOTE(review): presumably because the loop function bumps updated_unix on
	// every object it visits, moving it out of the filtered set — confirm
	// against callers.
	LoopFunctionAlwaysUpdates bool
}


// IterateLFSMetaObjectsForRepo provides a iterator for LFSMetaObjects per Repo // IterateLFSMetaObjectsForRepo provides a iterator for LFSMetaObjects per Repo
LFSMetaObject LFSMetaObject
} }


id := int64(0)

for { for {
beans := make([]*CountLFSMetaObject, 0, batchSize) beans := make([]*CountLFSMetaObject, 0, batchSize)
// SELECT `lfs_meta_object`.*, COUNT(`l1`.id) as `count` FROM lfs_meta_object INNER JOIN lfs_meta_object AS l1 ON l1.oid = lfs_meta_object.oid WHERE lfs_meta_object.repository_id = ? GROUP BY lfs_meta_object.id
sess := engine.Select("`lfs_meta_object`.*, COUNT(`l1`.oid) AS `count`"). sess := engine.Select("`lfs_meta_object`.*, COUNT(`l1`.oid) AS `count`").
Join("INNER", "`lfs_meta_object` AS l1", "`lfs_meta_object`.oid = `l1`.oid"). Join("INNER", "`lfs_meta_object` AS l1", "`lfs_meta_object`.oid = `l1`.oid").
Where("`lfs_meta_object`.repository_id = ?", repoID) Where("`lfs_meta_object`.repository_id = ?", repoID)
if !opts.OlderThan.IsZero() { if !opts.OlderThan.IsZero() {
sess.And("`lfs_meta_object`.created_unix < ?", opts.OlderThan) sess.And("`lfs_meta_object`.created_unix < ?", opts.OlderThan)
} }
if !opts.UpdatedLessRecentlyThan.IsZero() {
sess.And("`lfs_meta_object`.updated_unix < ?", opts.UpdatedLessRecentlyThan)
}
sess.GroupBy("`lfs_meta_object`.id") sess.GroupBy("`lfs_meta_object`.id")
if opts.OrderByUpdated {
sess.OrderBy("`lfs_meta_object`.updated_unix ASC")
} else {
sess.And("`lfs_meta_object`.id > ?", id)
sess.OrderBy("`lfs_meta_object`.id ASC")
}
if err := sess.Limit(batchSize, start).Find(&beans); err != nil { if err := sess.Limit(batchSize, start).Find(&beans); err != nil {
return err return err
} }
if len(beans) == 0 { if len(beans) == 0 {
return nil return nil
} }
start += len(beans)
if !opts.LoopFunctionAlwaysUpdates {
start += len(beans)
}


for _, bean := range beans { for _, bean := range beans {
if err := f(ctx, &bean.LFSMetaObject, bean.Count); err != nil { if err := f(ctx, &bean.LFSMetaObject, bean.Count); err != nil {
return err return err
} }
} }
id = beans[len(beans)-1].ID
}
}

// MarkLFSMetaObject updates the updated time for the provided LFSMetaObject
func MarkLFSMetaObject(ctx context.Context, id int64) error {
obj := &LFSMetaObject{
UpdatedUnix: timeutil.TimeStampNow(),
}
count, err := db.GetEngine(ctx).ID(id).Update(obj)
if count != 1 {
log.Error("Unexpectedly updated %d LFSMetaObjects with ID: %d", count, id)
} }
return err
} }

+ 5
- 0
models/migrations/migrations.go View File

NewMigration("Update counts of all open milestones", v1_18.UpdateOpenMilestoneCounts), NewMigration("Update counts of all open milestones", v1_18.UpdateOpenMilestoneCounts),
// v230 -> v231 // v230 -> v231
NewMigration("Add ConfidentialClient column (default true) to OAuth2Application table", v1_18.AddConfidentialClientColumnToOAuth2ApplicationTable), NewMigration("Add ConfidentialClient column (default true) to OAuth2Application table", v1_18.AddConfidentialClientColumnToOAuth2ApplicationTable),

// Gitea 1.18.0 ends at v231

// v231 -> v232 // v231 -> v232
NewMigration("Add index for hook_task", v1_19.AddIndexForHookTask), NewMigration("Add index for hook_task", v1_19.AddIndexForHookTask),
// v232 -> v233 // v232 -> v233
NewMigration("Create secrets table", v1_19.CreateSecretsTable), NewMigration("Create secrets table", v1_19.CreateSecretsTable),
// v237 -> v238 // v237 -> v238
NewMigration("Drop ForeignReference table", v1_19.DropForeignReferenceTable), NewMigration("Drop ForeignReference table", v1_19.DropForeignReferenceTable),
// v238 -> v239
NewMigration("Add updated unix to LFSMetaObject", v1_19.AddUpdatedUnixToLFSMetaObject),
} }


// GetCurrentDBVersion returns the current db version // GetCurrentDBVersion returns the current db version

+ 27
- 0
models/migrations/v1_19/v238.go View File

// Copyright 2022 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT

package v1_19 //nolint

import (
"code.gitea.io/gitea/modules/timeutil"

"xorm.io/xorm"
)

// AddUpdatedUnixToLFSMetaObject adds an updated column to the LFSMetaObject to allow for garbage collection
func AddUpdatedUnixToLFSMetaObject(x *xorm.Engine) error {
// The struct below is a migration-local snapshot of the table schema; the
// UpdatedUnix field (indexed, xorm "updated" tag) is the column this
// migration introduces.
// LFSMetaObject stores metadata for LFS tracked files.
type LFSMetaObject struct {
ID int64 `xorm:"pk autoincr"`
Oid string `json:"oid" xorm:"UNIQUE(s) INDEX NOT NULL"`
Size int64 `json:"size" xorm:"NOT NULL"`
RepositoryID int64 `xorm:"UNIQUE(s) INDEX NOT NULL"`
CreatedUnix timeutil.TimeStamp `xorm:"created"`
UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"`
}

// Synchronise the table with the struct above and return xorm's error, if any.
// NOTE(review): presumably Sync adds the missing column and index without
// touching existing rows — confirm against the xorm documentation.
return x.Sync(new(LFSMetaObject))
}

+ 15
- 1
modules/doctor/lfs.go View File

import ( import (
"context" "context"
"fmt" "fmt"
"time"


"code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting" "code.gitea.io/gitea/modules/setting"
return fmt.Errorf("LFS support is disabled") return fmt.Errorf("LFS support is disabled")
} }


if err := repository.GarbageCollectLFSMetaObjects(ctx, logger, autofix); err != nil {
if err := repository.GarbageCollectLFSMetaObjects(ctx, repository.GarbageCollectLFSMetaObjectsOptions{
Logger: logger,
AutoFix: autofix,
// Only attempt to garbage collect lfs meta objects older than a week as the order of git lfs upload
// and git object upload is not necessarily guaranteed. It's possible to imagine a situation whereby
// an LFS object is uploaded but the git branch is not uploaded immediately, or there are some rapid
// changes in new branches that might lead to lfs objects becoming temporarily unassociated with git
// objects.
//
// It is likely that a week is potentially excessive but it should definitely be enough that any
// unassociated LFS object is genuinely unassociated.
OlderThan: time.Now().Add(-24 * time.Hour * 7),
// We don't set the UpdatedLessRecentlyThan because we want to do a full GC
}); err != nil {
return err return err
} }



+ 1
- 0
options/locale/locale_en-US.ini View File

dashboard.delete_old_actions.started = Delete all old actions from database started. dashboard.delete_old_actions.started = Delete all old actions from database started.
dashboard.update_checker = Update checker dashboard.update_checker = Update checker
dashboard.delete_old_system_notices = Delete all old system notices from database dashboard.delete_old_system_notices = Delete all old system notices from database
dashboard.gc_lfs = Garbage collect LFS meta objects


users.user_manage_panel = User Account Management users.user_manage_panel = User Account Management
users.new_account = Create User Account users.new_account = Create User Account

+ 43
- 0
services/cron/tasks_extended.go View File

}) })
} }


// registerGCLFS registers the "gc_lfs" cron task, which garbage collects
// LFSMetaObjects across repositories. Nothing is registered when
// setting.LFS.StartServer is false.
//
// The task is disabled by default and scheduled "@every 24h". Its
// configuration extends OlderThanConfig with LastUpdatedMoreThanAgo,
// NumberToCheckPerRepo and ProportionToCheckPerRepo, all of which are
// forwarded to the garbage collector on every run.
func registerGCLFS() {
	if !setting.LFS.StartServer {
		return
	}
	type GCLFSConfig struct {
		OlderThanConfig
		LastUpdatedMoreThanAgo   time.Duration
		NumberToCheckPerRepo     int64
		ProportionToCheckPerRepo float64
	}

	RegisterTaskFatal("gc_lfs", &GCLFSConfig{
		OlderThanConfig: OlderThanConfig{
			BaseConfig: BaseConfig{
				Enabled:    false,
				RunAtStart: false,
				Schedule:   "@every 24h",
			},
			// Only attempt to garbage collect lfs meta objects older than a week as the order of git lfs upload
			// and git object upload is not necessarily guaranteed. It's possible to imagine a situation whereby
			// an LFS object is uploaded but the git branch is not uploaded immediately, or there are some rapid
			// changes in new branches that might lead to lfs objects becoming temporarily unassociated with git
			// objects.
			//
			// It is likely that a week is potentially excessive but it should definitely be enough that any
			// unassociated LFS object is genuinely unassociated.
			OlderThan: 24 * time.Hour * 7,
		},
		// Only GC things that haven't been looked at in the past 3 days
		LastUpdatedMoreThanAgo:   24 * time.Hour * 3,
		NumberToCheckPerRepo:     100,
		ProportionToCheckPerRepo: 0.6,
	}, func(ctx context.Context, _ *user_model.User, config Config) error {
		gcLFSConfig := config.(*GCLFSConfig)
		// Sample the clock once so both cut-offs are relative to the same instant.
		now := time.Now()
		return repo_service.GarbageCollectLFSMetaObjects(ctx, repo_service.GarbageCollectLFSMetaObjectsOptions{
			AutoFix:                 true,
			OlderThan:               now.Add(-gcLFSConfig.OlderThan),
			UpdatedLessRecentlyThan: now.Add(-gcLFSConfig.LastUpdatedMoreThanAgo),
			// Forward the per-repository limits; without these the configured
			// NUMBER_TO_CHECK_PER_REPO / PROPORTION_TO_CHECK_PER_REPO values
			// would never reach the garbage collector.
			NumberToCheckPerRepo:     gcLFSConfig.NumberToCheckPerRepo,
			ProportionToCheckPerRepo: gcLFSConfig.ProportionToCheckPerRepo,
		})
	})
}

func initExtendedTasks() { func initExtendedTasks() {
registerDeleteInactiveUsers() registerDeleteInactiveUsers()
registerDeleteRepositoryArchives() registerDeleteRepositoryArchives()
registerDeleteOldActions() registerDeleteOldActions()
registerUpdateGiteaChecker() registerUpdateGiteaChecker()
registerDeleteOldSystemNotices() registerDeleteOldSystemNotices()
registerGCLFS()
} }

+ 62
- 27
services/repository/lfs.go View File



import ( import (
"context" "context"
"errors"
"fmt" "fmt"
"time" "time"


"code.gitea.io/gitea/models/db"
git_model "code.gitea.io/gitea/models/git" git_model "code.gitea.io/gitea/models/git"
repo_model "code.gitea.io/gitea/models/repo" repo_model "code.gitea.io/gitea/models/repo"
"code.gitea.io/gitea/modules/git" "code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/lfs" "code.gitea.io/gitea/modules/lfs"
"code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/log"

"xorm.io/builder"
"code.gitea.io/gitea/modules/setting"
) )


func GarbageCollectLFSMetaObjects(ctx context.Context, logger log.Logger, autofix bool) error {
// GarbageCollectLFSMetaObjectsOptions provides options for GarbageCollectLFSMetaObjects function
type GarbageCollectLFSMetaObjectsOptions struct {
// Logger is optional; when non-nil, progress and summary messages are
// written to it with Info.
Logger log.Logger
// AutoFix selects whether orphaned objects are acted upon; when false they
// are only counted.
AutoFix bool
// OlderThan is forwarded to the per-repository iterator as its OlderThan
// filter.
OlderThan time.Time
// UpdatedLessRecentlyThan is forwarded to the per-repository iterator as its
// UpdatedLessRecentlyThan filter.
UpdatedLessRecentlyThan time.Time
// NumberToCheckPerRepo bounds how many LFSMetaObjects are visited per
// repository; a value of 0 or less disables the bound.
NumberToCheckPerRepo int64
// ProportionToCheckPerRepo is multiplied by a repository's LFSMetaObject
// count; if the result exceeds a non-zero NumberToCheckPerRepo it is used as
// the bound instead.
ProportionToCheckPerRepo float64
}

// GarbageCollectLFSMetaObjects garbage collects LFS objects for all repositories
func GarbageCollectLFSMetaObjects(ctx context.Context, opts GarbageCollectLFSMetaObjectsOptions) error {
log.Trace("Doing: GarbageCollectLFSMetaObjects") log.Trace("Doing: GarbageCollectLFSMetaObjects")
defer log.Trace("Finished: GarbageCollectLFSMetaObjects")


if err := db.Iterate(
ctx,
builder.And(builder.Gt{"id": 0}),
func(ctx context.Context, repo *repo_model.Repository) error {
return GarbageCollectLFSMetaObjectsForRepo(ctx, repo, logger, autofix)
},
); err != nil {
return err
if !setting.LFS.StartServer {
if opts.Logger != nil {
opts.Logger.Info("LFS support is disabled")
}
return nil
} }


log.Trace("Finished: GarbageCollectLFSMetaObjects")
return nil
return git_model.IterateRepositoryIDsWithLFSMetaObjects(ctx, func(ctx context.Context, repoID, count int64) error {
repo, err := repo_model.GetRepositoryByID(ctx, repoID)
if err != nil {
return err
}

if newMinimum := int64(float64(count) * opts.ProportionToCheckPerRepo); newMinimum > opts.NumberToCheckPerRepo && opts.NumberToCheckPerRepo != 0 {
opts.NumberToCheckPerRepo = newMinimum
}
return GarbageCollectLFSMetaObjectsForRepo(ctx, repo, opts)
})
} }


func GarbageCollectLFSMetaObjectsForRepo(ctx context.Context, repo *repo_model.Repository, logger log.Logger, autofix bool) error {
if logger != nil {
logger.Info("Checking %-v", repo)
// GarbageCollectLFSMetaObjectsForRepo garbage collects LFS objects for a specific repository
func GarbageCollectLFSMetaObjectsForRepo(ctx context.Context, repo *repo_model.Repository, opts GarbageCollectLFSMetaObjectsOptions) error {
if opts.Logger != nil {
opts.Logger.Info("Checking %-v", repo)
} }
total, orphaned, collected, deleted := 0, 0, 0, 0
if logger != nil {
total, orphaned, collected, deleted := int64(0), 0, 0, 0
if opts.Logger != nil {
defer func() { defer func() {
if orphaned == 0 { if orphaned == 0 {
logger.Info("Found %d total LFSMetaObjects in %-v", total, repo)
} else if !autofix {
logger.Info("Found %d/%d orphaned LFSMetaObjects in %-v", orphaned, total, repo)
opts.Logger.Info("Found %d total LFSMetaObjects in %-v", total, repo)
} else if !opts.AutoFix {
opts.Logger.Info("Found %d/%d orphaned LFSMetaObjects in %-v", orphaned, total, repo)
} else { } else {
logger.Info("Collected %d/%d orphaned/%d total LFSMetaObjects in %-v. %d removed from storage.", collected, orphaned, total, repo, deleted)
opts.Logger.Info("Collected %d/%d orphaned/%d total LFSMetaObjects in %-v. %d removed from storage.", collected, orphaned, total, repo, deleted)
} }
}() }()
} }
defer gitRepo.Close() defer gitRepo.Close()


store := lfs.NewContentStore() store := lfs.NewContentStore()
errStop := errors.New("STOPERR")


return git_model.IterateLFSMetaObjectsForRepo(ctx, repo.ID, func(ctx context.Context, metaObject *git_model.LFSMetaObject, count int64) error {
err = git_model.IterateLFSMetaObjectsForRepo(ctx, repo.ID, func(ctx context.Context, metaObject *git_model.LFSMetaObject, count int64) error {
if opts.NumberToCheckPerRepo > 0 && total > opts.NumberToCheckPerRepo {
return errStop
}
total++ total++
pointerSha := git.ComputeBlobHash([]byte(metaObject.Pointer.StringContent())) pointerSha := git.ComputeBlobHash([]byte(metaObject.Pointer.StringContent()))


if gitRepo.IsObjectExist(pointerSha.String()) { if gitRepo.IsObjectExist(pointerSha.String()) {
return nil
return git_model.MarkLFSMetaObject(ctx, metaObject.ID)
} }
orphaned++ orphaned++


if !autofix {
if !opts.AutoFix {
return nil return nil
} }
// Non-existent pointer file // Non-existent pointer file
// //
// It is likely that a week is potentially excessive but it should definitely be enough that any // It is likely that a week is potentially excessive but it should definitely be enough that any
// unassociated LFS object is genuinely unassociated. // unassociated LFS object is genuinely unassociated.
OlderThan: time.Now().Add(-24 * 7 * time.Hour),
OlderThan: opts.OlderThan,
UpdatedLessRecentlyThan: opts.UpdatedLessRecentlyThan,
OrderByUpdated: true,
LoopFunctionAlwaysUpdates: true,
}) })

if err == errStop {
if opts.Logger != nil {
opts.Logger.Info("Processing stopped at %d total LFSMetaObjects in %-v", total, repo)
}
return nil
} else if err != nil {
return err
}
return nil
} }

Loading…
Cancel
Save