about | summary | refs | log | tree | commit | diff | stats
path: root/modules
diff options
context:
space:
mode:
author: wxiaoguang <wxiaoguang@gmail.com> 2023-05-29 10:52:32 +0800
committer: GitHub <noreply@github.com> 2023-05-29 10:52:32 +0800
commit: 84c8ab9fd109145d04d0c58cadd46160f1ee9263 (patch)
tree: eebfb67c7414bae68b08ba3adca2dbb1f68a61cc /modules
parent: 8faf9465b3913137d3470151a678857c319625a5 (diff)
download: gitea-84c8ab9fd109145d04d0c58cadd46160f1ee9263.tar.gz
download: gitea-84c8ab9fd109145d04d0c58cadd46160f1ee9263.zip
Help to recover from corrupted levelqueue (#24912)
gitea.com experienced the corrupted LevelQueue bug again. I think the problem is clear now: if the keys in LevelDB go out of sync, the LevelQueue itself has no ability to recover. For example, LevelQueue.Len() reports 100 while LevelQueue.LPop() returns ErrNotFound (errors.New("no key found")). So it needs to dive into the LevelDB and remove all of the queue's keys to recover the corrupted LevelQueue. More comments are in TestCorruptedLevelQueue.
Diffstat (limited to 'modules')
-rw-r--r-- modules/queue/base_levelqueue.go 39
-rw-r--r-- modules/queue/base_levelqueue_common.go 17
-rw-r--r-- modules/queue/base_levelqueue_test.go 55
-rw-r--r-- modules/queue/base_levelqueue_unique.go 58
-rw-r--r-- modules/queue/lqinternal/lqinternal.go 48
5 files changed, 162 insertions, 55 deletions
diff --git a/modules/queue/base_levelqueue.go b/modules/queue/base_levelqueue.go
index afde502116..efc57c9c9c 100644
--- a/modules/queue/base_levelqueue.go
+++ b/modules/queue/base_levelqueue.go
@@ -5,16 +5,21 @@ package queue
import (
"context"
+ "sync/atomic"
"code.gitea.io/gitea/modules/nosql"
+ "code.gitea.io/gitea/modules/queue/lqinternal"
"gitea.com/lunny/levelqueue"
+ "github.com/syndtr/goleveldb/leveldb"
)
type baseLevelQueue struct {
- internal *levelqueue.Queue
- conn string
- cfg *BaseConfig
+ internal atomic.Pointer[levelqueue.Queue]
+
+ conn string
+ cfg *BaseConfig
+ db *leveldb.DB
}
var _ baseQueue = (*baseLevelQueue)(nil)
@@ -31,21 +36,23 @@ func newBaseLevelQueueSimple(cfg *BaseConfig) (baseQueue, error) {
if err != nil {
return nil, err
}
- q := &baseLevelQueue{conn: conn, cfg: cfg}
- q.internal, err = levelqueue.NewQueue(db, []byte(cfg.QueueFullName), false)
+ q := &baseLevelQueue{conn: conn, cfg: cfg, db: db}
+ lq, err := levelqueue.NewQueue(db, []byte(cfg.QueueFullName), false)
if err != nil {
return nil, err
}
-
+ q.internal.Store(lq)
return q, nil
}
func (q *baseLevelQueue) PushItem(ctx context.Context, data []byte) error {
- return baseLevelQueueCommon(q.cfg, q.internal, nil).PushItem(ctx, data)
+ c := baseLevelQueueCommon(q.cfg, nil, func() baseLevelQueuePushPoper { return q.internal.Load() })
+ return c.PushItem(ctx, data)
}
func (q *baseLevelQueue) PopItem(ctx context.Context) ([]byte, error) {
- return baseLevelQueueCommon(q.cfg, q.internal, nil).PopItem(ctx)
+ c := baseLevelQueueCommon(q.cfg, nil, func() baseLevelQueuePushPoper { return q.internal.Load() })
+ return c.PopItem(ctx)
}
func (q *baseLevelQueue) HasItem(ctx context.Context, data []byte) (bool, error) {
@@ -53,20 +60,24 @@ func (q *baseLevelQueue) HasItem(ctx context.Context, data []byte) (bool, error)
}
func (q *baseLevelQueue) Len(ctx context.Context) (int, error) {
- return int(q.internal.Len()), nil
+ return int(q.internal.Load().Len()), nil
}
func (q *baseLevelQueue) Close() error {
- err := q.internal.Close()
+ err := q.internal.Load().Close()
_ = nosql.GetManager().CloseLevelDB(q.conn)
+ q.db = nil // the db is not managed by us, it's managed by the nosql manager
return err
}
func (q *baseLevelQueue) RemoveAll(ctx context.Context) error {
- for q.internal.Len() > 0 {
- if _, err := q.internal.LPop(); err != nil {
- return err
- }
+ lqinternal.RemoveLevelQueueKeys(q.db, []byte(q.cfg.QueueFullName))
+ lq, err := levelqueue.NewQueue(q.db, []byte(q.cfg.QueueFullName), false)
+ if err != nil {
+ return err
}
+ old := q.internal.Load()
+ q.internal.Store(lq)
+ _ = old.Close() // Not ideal for concurrency. Luckily, the levelqueue only sets its db=nil because it doesn't manage the db, so far so good
return nil
}
diff --git a/modules/queue/base_levelqueue_common.go b/modules/queue/base_levelqueue_common.go
index 409a965517..78d3b85a8a 100644
--- a/modules/queue/base_levelqueue_common.go
+++ b/modules/queue/base_levelqueue_common.go
@@ -17,6 +17,7 @@ import (
"github.com/syndtr/goleveldb/leveldb"
)
+// baseLevelQueuePushPoper is the common interface for levelqueue.Queue and levelqueue.UniqueQueue
type baseLevelQueuePushPoper interface {
RPush(data []byte) error
LPop() ([]byte, error)
@@ -24,9 +25,9 @@ type baseLevelQueuePushPoper interface {
}
type baseLevelQueueCommonImpl struct {
- length int
- internal baseLevelQueuePushPoper
- mu *sync.Mutex
+ length int
+ internalFunc func() baseLevelQueuePushPoper
+ mu *sync.Mutex
}
func (q *baseLevelQueueCommonImpl) PushItem(ctx context.Context, data []byte) error {
@@ -36,11 +37,11 @@ func (q *baseLevelQueueCommonImpl) PushItem(ctx context.Context, data []byte) er
defer q.mu.Unlock()
}
- cnt := int(q.internal.Len())
+ cnt := int(q.internalFunc().Len())
if cnt >= q.length {
return true, nil
}
- retry, err = false, q.internal.RPush(data)
+ retry, err = false, q.internalFunc().RPush(data)
if err == levelqueue.ErrAlreadyInQueue {
err = ErrAlreadyInQueue
}
@@ -55,7 +56,7 @@ func (q *baseLevelQueueCommonImpl) PopItem(ctx context.Context) ([]byte, error)
defer q.mu.Unlock()
}
- data, err = q.internal.LPop()
+ data, err = q.internalFunc().LPop()
if err == levelqueue.ErrNotFound {
return true, nil, nil
}
@@ -66,8 +67,8 @@ func (q *baseLevelQueueCommonImpl) PopItem(ctx context.Context) ([]byte, error)
})
}
-func baseLevelQueueCommon(cfg *BaseConfig, internal baseLevelQueuePushPoper, mu *sync.Mutex) *baseLevelQueueCommonImpl {
- return &baseLevelQueueCommonImpl{length: cfg.Length, internal: internal}
+func baseLevelQueueCommon(cfg *BaseConfig, mu *sync.Mutex, internalFunc func() baseLevelQueuePushPoper) *baseLevelQueueCommonImpl {
+ return &baseLevelQueueCommonImpl{length: cfg.Length, mu: mu, internalFunc: internalFunc}
}
func prepareLevelDB(cfg *BaseConfig) (conn string, db *leveldb.DB, err error) {
diff --git a/modules/queue/base_levelqueue_test.go b/modules/queue/base_levelqueue_test.go
index 712a0892cd..b881802ca2 100644
--- a/modules/queue/base_levelqueue_test.go
+++ b/modules/queue/base_levelqueue_test.go
@@ -6,9 +6,12 @@ package queue
import (
"testing"
+ "code.gitea.io/gitea/modules/queue/lqinternal"
"code.gitea.io/gitea/modules/setting"
+ "gitea.com/lunny/levelqueue"
"github.com/stretchr/testify/assert"
+ "github.com/syndtr/goleveldb/leveldb"
)
func TestBaseLevelDB(t *testing.T) {
@@ -21,3 +24,55 @@ func TestBaseLevelDB(t *testing.T) {
testQueueBasic(t, newBaseLevelQueueSimple, toBaseConfig("baseLevelQueue", setting.QueueSettings{Datadir: t.TempDir() + "/queue-test", Length: 10}), false)
testQueueBasic(t, newBaseLevelQueueUnique, toBaseConfig("baseLevelQueueUnique", setting.QueueSettings{ConnStr: "leveldb://" + t.TempDir() + "/queue-test", Length: 10}), true)
}
+
+func TestCorruptedLevelQueue(t *testing.T) {
+ // sometimes the levelqueue could be in a corrupted state, this test is to make sure it can recover from it
+ dbDir := t.TempDir() + "/levelqueue-test"
+ db, err := leveldb.OpenFile(dbDir, nil)
+ if !assert.NoError(t, err) {
+ return
+ }
+ defer db.Close()
+
+ assert.NoError(t, db.Put([]byte("other-key"), []byte("other-value"), nil))
+
+ nameQueuePrefix := []byte("queue_name")
+ nameSetPrefix := []byte("set_name")
+ lq, err := levelqueue.NewUniqueQueue(db, nameQueuePrefix, nameSetPrefix, false)
+ assert.NoError(t, err)
+ assert.NoError(t, lq.RPush([]byte("item-1")))
+
+ itemKey := lqinternal.QueueItemKeyBytes(nameQueuePrefix, 1)
+ itemValue, err := db.Get(itemKey, nil)
+ assert.NoError(t, err)
+ assert.Equal(t, []byte("item-1"), itemValue)
+
+ // there should be 5 keys in db: queue low, queue high, 1 queue item, 1 set item, and "other-key"
+ keys := lqinternal.ListLevelQueueKeys(db)
+ assert.Len(t, keys, 5)
+
+ // delete the queue item key, to corrupt the queue
+ assert.NoError(t, db.Delete(itemKey, nil))
+ // now the queue is corrupted, it never works again
+ _, err = lq.LPop()
+ assert.ErrorIs(t, err, levelqueue.ErrNotFound)
+ assert.NoError(t, lq.Close())
+
+ // remove all the queue related keys to reset the queue
+ lqinternal.RemoveLevelQueueKeys(db, nameQueuePrefix)
+ lqinternal.RemoveLevelQueueKeys(db, nameSetPrefix)
+ // now there should be only 1 key in db: "other-key"
+ keys = lqinternal.ListLevelQueueKeys(db)
+ assert.Len(t, keys, 1)
+ assert.Equal(t, []byte("other-key"), keys[0])
+
+ // re-create a queue from db
+ lq, err = levelqueue.NewUniqueQueue(db, nameQueuePrefix, nameSetPrefix, false)
+ assert.NoError(t, err)
+ assert.NoError(t, lq.RPush([]byte("item-new-1")))
+ // now the queue works again
+ itemValue, err = lq.LPop()
+ assert.NoError(t, err)
+ assert.Equal(t, []byte("item-new-1"), itemValue)
+ assert.NoError(t, lq.Close())
+}
diff --git a/modules/queue/base_levelqueue_unique.go b/modules/queue/base_levelqueue_unique.go
index 1acd504e32..968a4e98d4 100644
--- a/modules/queue/base_levelqueue_unique.go
+++ b/modules/queue/base_levelqueue_unique.go
@@ -6,18 +6,21 @@ package queue
import (
"context"
"sync"
- "unsafe"
+ "sync/atomic"
"code.gitea.io/gitea/modules/nosql"
+ "code.gitea.io/gitea/modules/queue/lqinternal"
"gitea.com/lunny/levelqueue"
"github.com/syndtr/goleveldb/leveldb"
)
type baseLevelQueueUnique struct {
- internal *levelqueue.UniqueQueue
- conn string
- cfg *BaseConfig
+ internal atomic.Pointer[levelqueue.UniqueQueue]
+
+ conn string
+ cfg *BaseConfig
+ db *leveldb.DB
mu sync.Mutex // the levelqueue.UniqueQueue is not thread-safe, there is no mutex protecting the underlying queue&set together
}
@@ -29,39 +32,42 @@ func newBaseLevelQueueUnique(cfg *BaseConfig) (baseQueue, error) {
if err != nil {
return nil, err
}
- q := &baseLevelQueueUnique{conn: conn, cfg: cfg}
- q.internal, err = levelqueue.NewUniqueQueue(db, []byte(cfg.QueueFullName), []byte(cfg.SetFullName), false)
+ q := &baseLevelQueueUnique{conn: conn, cfg: cfg, db: db}
+ lq, err := levelqueue.NewUniqueQueue(db, []byte(cfg.QueueFullName), []byte(cfg.SetFullName), false)
if err != nil {
return nil, err
}
-
+ q.internal.Store(lq)
return q, nil
}
func (q *baseLevelQueueUnique) PushItem(ctx context.Context, data []byte) error {
- return baseLevelQueueCommon(q.cfg, q.internal, &q.mu).PushItem(ctx, data)
+ c := baseLevelQueueCommon(q.cfg, &q.mu, func() baseLevelQueuePushPoper { return q.internal.Load() })
+ return c.PushItem(ctx, data)
}
func (q *baseLevelQueueUnique) PopItem(ctx context.Context) ([]byte, error) {
- return baseLevelQueueCommon(q.cfg, q.internal, &q.mu).PopItem(ctx)
+ c := baseLevelQueueCommon(q.cfg, &q.mu, func() baseLevelQueuePushPoper { return q.internal.Load() })
+ return c.PopItem(ctx)
}
func (q *baseLevelQueueUnique) HasItem(ctx context.Context, data []byte) (bool, error) {
q.mu.Lock()
defer q.mu.Unlock()
- return q.internal.Has(data)
+ return q.internal.Load().Has(data)
}
func (q *baseLevelQueueUnique) Len(ctx context.Context) (int, error) {
q.mu.Lock()
defer q.mu.Unlock()
- return int(q.internal.Len()), nil
+ return int(q.internal.Load().Len()), nil
}
func (q *baseLevelQueueUnique) Close() error {
q.mu.Lock()
defer q.mu.Unlock()
- err := q.internal.Close()
+ err := q.internal.Load().Close()
+ q.db = nil // the db is not managed by us, it's managed by the nosql manager
_ = nosql.GetManager().CloseLevelDB(q.conn)
return err
}
@@ -69,28 +75,14 @@ func (q *baseLevelQueueUnique) Close() error {
func (q *baseLevelQueueUnique) RemoveAll(ctx context.Context) error {
q.mu.Lock()
defer q.mu.Unlock()
-
- type levelUniqueQueue struct {
- q *levelqueue.Queue
- set *levelqueue.Set
- db *leveldb.DB
- }
- lq := (*levelUniqueQueue)(unsafe.Pointer(q.internal))
-
- for lq.q.Len() > 0 {
- if _, err := lq.q.LPop(); err != nil {
- return err
- }
- }
-
- // the "set" must be cleared after the "list" because there is no transaction.
- // it's better to have duplicate items than losing items.
- members, err := lq.set.Members()
+ lqinternal.RemoveLevelQueueKeys(q.db, []byte(q.cfg.QueueFullName))
+ lqinternal.RemoveLevelQueueKeys(q.db, []byte(q.cfg.SetFullName))
+ lq, err := levelqueue.NewUniqueQueue(q.db, []byte(q.cfg.QueueFullName), []byte(q.cfg.SetFullName), false)
if err != nil {
- return err // seriously corrupted
- }
- for _, v := range members {
- _, _ = lq.set.Remove(v)
+ return err
}
+ old := q.internal.Load()
+ q.internal.Store(lq)
+ _ = old.Close() // Not ideal for concurrency. Luckily, the levelqueue only sets its db=nil because it doesn't manage the db, so far so good
return nil
}
diff --git a/modules/queue/lqinternal/lqinternal.go b/modules/queue/lqinternal/lqinternal.go
new file mode 100644
index 0000000000..89aa4e5989
--- /dev/null
+++ b/modules/queue/lqinternal/lqinternal.go
@@ -0,0 +1,48 @@
+// Copyright 2023 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package lqinternal
+
+import (
+ "bytes"
+ "encoding/binary"
+
+ "github.com/syndtr/goleveldb/leveldb"
+ "github.com/syndtr/goleveldb/leveldb/opt"
+)
+
+func QueueItemIDBytes(id int64) []byte {
+ buf := make([]byte, 8)
+ binary.PutVarint(buf, id)
+ return buf
+}
+
+func QueueItemKeyBytes(prefix []byte, id int64) []byte {
+ key := make([]byte, len(prefix), len(prefix)+1+8)
+ copy(key, prefix)
+ key = append(key, '-')
+ return append(key, QueueItemIDBytes(id)...)
+}
+
+func RemoveLevelQueueKeys(db *leveldb.DB, namePrefix []byte) {
+ keyPrefix := make([]byte, len(namePrefix)+1)
+ copy(keyPrefix, namePrefix)
+ keyPrefix[len(namePrefix)] = '-'
+
+ it := db.NewIterator(nil, &opt.ReadOptions{Strict: opt.NoStrict})
+ defer it.Release()
+ for it.Next() {
+ if bytes.HasPrefix(it.Key(), keyPrefix) {
+ _ = db.Delete(it.Key(), nil)
+ }
+ }
+}
+
+func ListLevelQueueKeys(db *leveldb.DB) (res [][]byte) {
+ it := db.NewIterator(nil, &opt.ReadOptions{Strict: opt.NoStrict})
+ defer it.Release()
+ for it.Next() {
+ res = append(res, it.Key())
+ }
+ return res
+}