From 27e49cd01cf33c7adfd7e1a897d95884f7714aca Mon Sep 17 00:00:00 2001 From: zeripath Date: Tue, 28 Feb 2023 22:55:43 +0000 Subject: Properly flush unique queues on startup (#23154) There have been a number of reports of PRs being blocked whilst being checked which have been difficult to debug. In investigating #23050 I have realised that whilst the Warn there is somewhat of a miscall there was a real bug in the way that the LevelUniqueQueue was being restored on start-up of the PersistableChannelUniqueQueue. Next there is a conflict in the setting of the internal leveldb queue name - This wasn't being set so it was being overridden by other unique queues. This PR fixes these bugs and adds a testcase. Thanks to @brechtvl for noticing the second issue. Fix #23050 and others --------- Signed-off-by: Andrew Thornton Co-authored-by: techknowlogick --- modules/queue/unique_queue_disk_channel.go | 41 ++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 8 deletions(-) (limited to 'modules/queue/unique_queue_disk_channel.go') diff --git a/modules/queue/unique_queue_disk_channel.go b/modules/queue/unique_queue_disk_channel.go index 405726182d..cc8a807c67 100644 --- a/modules/queue/unique_queue_disk_channel.go +++ b/modules/queue/unique_queue_disk_channel.go @@ -94,7 +94,8 @@ func NewPersistableChannelUniqueQueue(handle HandlerFunc, cfg, exemplar interfac }, Workers: 0, }, - DataDir: config.DataDir, + DataDir: config.DataDir, + QueueName: config.Name + "-level", } queue.channelQueue = channelUniqueQueue.(*ChannelUniqueQueue) @@ -209,17 +210,29 @@ func (q *PersistableChannelUniqueQueue) Run(atShutdown, atTerminate func(func()) atTerminate(q.Terminate) _ = q.channelQueue.AddWorkers(q.channelQueue.workers, 0) - if luq, ok := q.internal.(*LevelUniqueQueue); ok && luq.ByteFIFOUniqueQueue.byteFIFO.Len(luq.shutdownCtx) != 0 { + if luq, ok := q.internal.(*LevelUniqueQueue); ok && !luq.IsEmpty() { // Just run the level queue - we shut it down once it's flushed - go q.internal.Run(func(_ func()) {}, func(_ func()) {}) + go luq.Run(func(_ func()) {}, func(_ func()) {}) go func() { - _ = q.internal.Flush(0) - log.Debug("LevelUniqueQueue: %s flushed so shutting down", q.internal.(*LevelUniqueQueue).Name()) - q.internal.(*LevelUniqueQueue).Shutdown() - GetManager().Remove(q.internal.(*LevelUniqueQueue).qid) + _ = luq.Flush(0) + for !luq.IsEmpty() { + _ = luq.Flush(0) + select { + case <-time.After(100 * time.Millisecond): + case <-luq.shutdownCtx.Done(): + if luq.byteFIFO.Len(luq.terminateCtx) > 0 { + log.Warn("LevelUniqueQueue: %s shut down before completely flushed", luq.Name()) + } + return + } + } + log.Debug("LevelUniqueQueue: %s flushed so shutting down", luq.Name()) + luq.Shutdown() + GetManager().Remove(luq.qid) }() } else { log.Debug("PersistableChannelUniqueQueue: %s Skipping running the empty level queue", q.delayedStarter.name) + _ = q.internal.Flush(0) q.internal.(*LevelUniqueQueue).Shutdown() GetManager().Remove(q.internal.(*LevelUniqueQueue).qid) } @@ -285,8 +298,20 @@ func (q *PersistableChannelUniqueQueue) Shutdown() { // Redirect all remaining data in the chan to the internal channel close(q.channelQueue.dataChan) log.Trace("PersistableChannelUniqueQueue: %s Redirecting remaining data", q.delayedStarter.name) + countOK, countLost := 0, 0 for data := range q.channelQueue.dataChan { - _ = q.internal.Push(data) + err := q.internal.(*LevelUniqueQueue).Push(data) + if err != nil { + log.Error("PersistableChannelUniqueQueue: %s Unable redirect %v due to: %v", q.delayedStarter.name, data, err) + countLost++ + } else { + countOK++ + } + } + if countLost > 0 { + log.Warn("PersistableChannelUniqueQueue: %s %d will be restored on restart, %d lost", q.delayedStarter.name, countOK, countLost) + } else if countOK > 0 { + log.Warn("PersistableChannelUniqueQueue: %s %d will be restored on restart", q.delayedStarter.name, countOK) } log.Trace("PersistableChannelUniqueQueue: %s Done Redirecting remaining data", q.delayedStarter.name) -- cgit v1.2.3