
persister.go

// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package scorch

import (
	"bytes"
	"encoding/binary"
	"encoding/json"
	"fmt"
	"io/ioutil"
	"log"
	"math"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"sync/atomic"
	"time"

	"github.com/RoaringBitmap/roaring"
	"github.com/blevesearch/bleve/index"
	"github.com/blevesearch/bleve/index/scorch/segment"
	bolt "go.etcd.io/bbolt"
)

// DefaultPersisterNapTimeMSec is kept at zero as this helps in direct
// persistence of segments with the default safe batch option.
// If the default safe batch option results in a high number of
// files on disk, then users may initialise this configuration parameter
// with higher values so that the persister will nap a bit within its
// work loop to favour better in-memory merging of segments to result
// in fewer segment files on disk. But that may come with an indexing
// performance overhead.
// Unsafe batch users are advised to override this to a higher value
// for better performance especially with high data density.
var DefaultPersisterNapTimeMSec int = 0 // ms

// DefaultPersisterNapUnderNumFiles helps in controlling the pace of the
// persister. At times of slow merger progress with heavy file merging
// operations, it's better to pace down the persister to let the merger
// catch up within a range defined by this parameter.
// Fewer files on disk (as per the merge plan) would result in keeping the
// file handle usage under limit, faster disk merging and a healthier index.
// It's been observed that such a loosely sync'ed introducer-persister-merger
// trio results in better overall performance.
var DefaultPersisterNapUnderNumFiles int = 1000

var DefaultMemoryPressurePauseThreshold uint64 = math.MaxUint64

type persisterOptions struct {
	// PersisterNapTimeMSec controls the wait/delay injected into
	// persistence workloop to improve the chances for
	// a healthier and heavier in-memory merging
	PersisterNapTimeMSec int

	// If PersisterNapTimeMSec > 0, and the number of files is less than
	// PersisterNapUnderNumFiles, then the persister will sleep
	// PersisterNapTimeMSec amount of time to improve the chances for
	// a healthier and heavier in-memory merging
	PersisterNapUnderNumFiles int

	// MemoryPressurePauseThreshold gives the persister better leeway
	// for prudently performing the in-memory merge of segments in a
	// memory pressure situation. Here the config value is an upper threshold
	// for the number of paused application threads. The default value would
	// be a very high number to always favour the merging of memory segments.
	MemoryPressurePauseThreshold uint64
}
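// As an illustration of how these knobs interact (the values below are
// hypothetical, not recommendations): with PersisterNapTimeMSec set to 2000
// and PersisterNapUnderNumFiles left at 1000, the persister naps for 2s
// whenever fewer than 1000 files are present in the index directory and no
// application threads are paused, giving the in-memory merger more segments
// to coalesce before the next persist; at or above 1000 files the persister
// instead cleans up old data and, if the merger is lagging, waits for it to
// catch up before persisting (see pausePersisterForMergerCatchUp below).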
type notificationChan chan struct{}

func (s *Scorch) persisterLoop() {
	defer s.asyncTasks.Done()

	var persistWatchers []*epochWatcher
	var lastPersistedEpoch, lastMergedEpoch uint64
	var ew *epochWatcher

	var unpersistedCallbacks []index.BatchCallback

	po, err := s.parsePersisterOptions()
	if err != nil {
		s.fireAsyncError(fmt.Errorf("persisterOptions json parsing err: %v", err))
		return
	}

OUTER:
	for {
		atomic.AddUint64(&s.stats.TotPersistLoopBeg, 1)

		select {
		case <-s.closeCh:
			break OUTER
		case ew = <-s.persisterNotifier:
			persistWatchers = append(persistWatchers, ew)
		default:
		}
		if ew != nil && ew.epoch > lastMergedEpoch {
			lastMergedEpoch = ew.epoch
		}
		lastMergedEpoch, persistWatchers = s.pausePersisterForMergerCatchUp(lastPersistedEpoch,
			lastMergedEpoch, persistWatchers, po)

		var ourSnapshot *IndexSnapshot
		var ourPersisted []chan error
		var ourPersistedCallbacks []index.BatchCallback

		// check to see if there is a new snapshot to persist
		s.rootLock.Lock()
		if s.root != nil && s.root.epoch > lastPersistedEpoch {
			ourSnapshot = s.root
			ourSnapshot.AddRef()
			ourPersisted = s.rootPersisted
			s.rootPersisted = nil
			ourPersistedCallbacks = s.persistedCallbacks
			s.persistedCallbacks = nil
			atomic.StoreUint64(&s.iStats.persistSnapshotSize, uint64(ourSnapshot.Size()))
			atomic.StoreUint64(&s.iStats.persistEpoch, ourSnapshot.epoch)
		}
		s.rootLock.Unlock()

		if ourSnapshot != nil {
			startTime := time.Now()

			err := s.persistSnapshot(ourSnapshot, po)
			for _, ch := range ourPersisted {
				if err != nil {
					ch <- err
				}
				close(ch)
			}
			if err != nil {
				atomic.StoreUint64(&s.iStats.persistEpoch, 0)
				if err == segment.ErrClosed {
					// index has been closed
					_ = ourSnapshot.DecRef()
					break OUTER
				}

				// save this current snapshot's persistedCallbacks, to invoke during
				// the retry attempt
				unpersistedCallbacks = append(unpersistedCallbacks, ourPersistedCallbacks...)

				s.fireAsyncError(fmt.Errorf("got err persisting snapshot: %v", err))
				_ = ourSnapshot.DecRef()
				atomic.AddUint64(&s.stats.TotPersistLoopErr, 1)
				continue OUTER
			}

			if unpersistedCallbacks != nil {
				// in the event of this being a retry attempt for persisting a snapshot
				// that had earlier failed, prepend the persistedCallbacks associated
				// with earlier segment(s) to the latest persistedCallbacks
				ourPersistedCallbacks = append(unpersistedCallbacks, ourPersistedCallbacks...)
				unpersistedCallbacks = nil
			}

			for i := range ourPersistedCallbacks {
				ourPersistedCallbacks[i](err)
			}

			atomic.StoreUint64(&s.stats.LastPersistedEpoch, ourSnapshot.epoch)

			lastPersistedEpoch = ourSnapshot.epoch
			for _, ew := range persistWatchers {
				close(ew.notifyCh)
			}

			persistWatchers = nil
			_ = ourSnapshot.DecRef()

			changed := false
			s.rootLock.RLock()
			if s.root != nil && s.root.epoch != lastPersistedEpoch {
				changed = true
			}
			s.rootLock.RUnlock()

			s.fireEvent(EventKindPersisterProgress, time.Since(startTime))

			if changed {
				atomic.AddUint64(&s.stats.TotPersistLoopProgress, 1)
				continue OUTER
			}
		}

		// tell the introducer we're waiting for changes
		w := &epochWatcher{
			epoch:    lastPersistedEpoch,
			notifyCh: make(notificationChan, 1),
		}

		select {
		case <-s.closeCh:
			break OUTER
		case s.introducerNotifier <- w:
		}

		s.removeOldData() // might as well cleanup while waiting

		atomic.AddUint64(&s.stats.TotPersistLoopWait, 1)

		select {
		case <-s.closeCh:
			break OUTER
		case <-w.notifyCh:
			// woken up, next loop should pick up work
			atomic.AddUint64(&s.stats.TotPersistLoopWaitNotified, 1)
		case ew = <-s.persisterNotifier:
			// if the watchers are already caught up then let them wait,
			// else let them continue to do the catch up
			persistWatchers = append(persistWatchers, ew)
		}

		atomic.AddUint64(&s.stats.TotPersistLoopEnd, 1)
	}
}

func notifyMergeWatchers(lastPersistedEpoch uint64,
	persistWatchers []*epochWatcher) []*epochWatcher {
	var watchersNext []*epochWatcher
	for _, w := range persistWatchers {
		if w.epoch < lastPersistedEpoch {
			close(w.notifyCh)
		} else {
			watchersNext = append(watchersNext, w)
		}
	}
	return watchersNext
}

func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64,
	lastMergedEpoch uint64, persistWatchers []*epochWatcher,
	po *persisterOptions) (uint64, []*epochWatcher) {

	// First, let the watchers proceed if they lag behind
	persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers)

	// Check the merger lag by counting the segment files on disk.
	numFilesOnDisk, _, _ := s.diskFileStats(nil)

	// On finding fewer files on disk, the persister takes a short pause
	// to let sufficient in-memory segments pile up for the next
	// memory merge cum persist loop.
	if numFilesOnDisk < uint64(po.PersisterNapUnderNumFiles) &&
		po.PersisterNapTimeMSec > 0 && s.paused() == 0 {
		select {
		case <-s.closeCh:
		case <-time.After(time.Millisecond * time.Duration(po.PersisterNapTimeMSec)):
			atomic.AddUint64(&s.stats.TotPersisterNapPauseCompleted, 1)

		case ew := <-s.persisterNotifier:
			// unblock the merger in the meantime
			persistWatchers = append(persistWatchers, ew)
			lastMergedEpoch = ew.epoch
			persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers)
			atomic.AddUint64(&s.stats.TotPersisterMergerNapBreak, 1)
		}
		return lastMergedEpoch, persistWatchers
	}

	// Finding too many files on disk could be due to two reasons.
	// 1. Too many older snapshots awaiting the clean up.
	// 2. The merger could be lagging behind on merging the disk files.
	if numFilesOnDisk > uint64(po.PersisterNapUnderNumFiles) {
		s.removeOldData()
		numFilesOnDisk, _, _ = s.diskFileStats(nil)
	}

	// The persister pauses until the merger catches up to bring the segment
	// file count under the threshold.
	// But if there is memory pressure, then skip these sleep maneuvers.
OUTER:
	for po.PersisterNapUnderNumFiles > 0 &&
		numFilesOnDisk >= uint64(po.PersisterNapUnderNumFiles) &&
		lastMergedEpoch < lastPersistedEpoch {
		atomic.AddUint64(&s.stats.TotPersisterSlowMergerPause, 1)

		select {
		case <-s.closeCh:
			break OUTER
		case ew := <-s.persisterNotifier:
			persistWatchers = append(persistWatchers, ew)
			lastMergedEpoch = ew.epoch
		}

		atomic.AddUint64(&s.stats.TotPersisterSlowMergerResume, 1)

		// let the watchers proceed if they lag behind
		persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers)

		numFilesOnDisk, _, _ = s.diskFileStats(nil)
	}

	return lastMergedEpoch, persistWatchers
}

func (s *Scorch) parsePersisterOptions() (*persisterOptions, error) {
	po := persisterOptions{
		PersisterNapTimeMSec:         DefaultPersisterNapTimeMSec,
		PersisterNapUnderNumFiles:    DefaultPersisterNapUnderNumFiles,
		MemoryPressurePauseThreshold: DefaultMemoryPressurePauseThreshold,
	}
	if v, ok := s.config["scorchPersisterOptions"]; ok {
		b, err := json.Marshal(v)
		if err != nil {
			return &po, err
		}

		err = json.Unmarshal(b, &po)
		if err != nil {
			return &po, err
		}
	}
	return &po, nil
}
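// For example, a caller could supply these options through the scorch config
// map before the index is opened; the keys simply mirror the persisterOptions
// field names (encoding/json matches them case-insensitively), and the values
// below are illustrative only:
//
//	config := map[string]interface{}{
//		"scorchPersisterOptions": map[string]interface{}{
//			"persisterNapTimeMSec":      2000,
//			"persisterNapUnderNumFiles": 500,
//		},
//	}
//
// parsePersisterOptions marshals that sub-map back to JSON and unmarshals it
// into persisterOptions, so any unspecified fields keep their defaults.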
func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot,
	po *persisterOptions) error {
	// Perform in-memory segment merging only when the memory pressure is
	// below the configured threshold, else the persister performs the
	// direct persistence of segments.
	if s.paused() < po.MemoryPressurePauseThreshold {
		persisted, err := s.persistSnapshotMaybeMerge(snapshot)
		if err != nil {
			return err
		}
		if persisted {
			return nil
		}
	}

	return s.persistSnapshotDirect(snapshot)
}

// DefaultMinSegmentsForInMemoryMerge represents the default number of
// in-memory zap segments that persistSnapshotMaybeMerge() needs to
// see in an IndexSnapshot before it decides to merge and persist
// those segments
var DefaultMinSegmentsForInMemoryMerge = 2

// persistSnapshotMaybeMerge examines the snapshot and might merge and
// persist the in-memory zap segments if there are enough of them
func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot) (
	bool, error) {
	// collect the in-memory zap segments (SegmentBase instances)
	var sbs []segment.Segment
	var sbsDrops []*roaring.Bitmap
	var sbsIndexes []int

	for i, segmentSnapshot := range snapshot.segment {
		if _, ok := segmentSnapshot.segment.(segment.PersistedSegment); !ok {
			sbs = append(sbs, segmentSnapshot.segment)
			sbsDrops = append(sbsDrops, segmentSnapshot.deleted)
			sbsIndexes = append(sbsIndexes, i)
		}
	}

	if len(sbs) < DefaultMinSegmentsForInMemoryMerge {
		return false, nil
	}

	newSnapshot, newSegmentID, err := s.mergeSegmentBases(
		snapshot, sbs, sbsDrops, sbsIndexes)
	if err != nil {
		return false, err
	}
	if newSnapshot == nil {
		return false, nil
	}

	defer func() {
		_ = newSnapshot.DecRef()
	}()

	mergedSegmentIDs := map[uint64]struct{}{}
	for _, idx := range sbsIndexes {
		mergedSegmentIDs[snapshot.segment[idx].id] = struct{}{}
	}

	// construct a snapshot that's logically equivalent to the input
	// snapshot, but with merged segments replaced by the new segment
	equiv := &IndexSnapshot{
		parent:   snapshot.parent,
		segment:  make([]*SegmentSnapshot, 0, len(snapshot.segment)),
		internal: snapshot.internal,
		epoch:    snapshot.epoch,
		creator:  "persistSnapshotMaybeMerge",
	}

	// copy to the equiv the segments that weren't replaced
	for _, segment := range snapshot.segment {
		if _, wasMerged := mergedSegmentIDs[segment.id]; !wasMerged {
			equiv.segment = append(equiv.segment, segment)
		}
	}

	// append to the equiv the new segment
	for _, segment := range newSnapshot.segment {
		if segment.id == newSegmentID {
			equiv.segment = append(equiv.segment, &SegmentSnapshot{
				id:      newSegmentID,
				segment: segment.segment,
				deleted: nil, // nil since merging handled deletions
			})
			break
		}
	}

	err = s.persistSnapshotDirect(equiv)
	if err != nil {
		return false, err
	}

	return true, nil
}

func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) {
	// start a write transaction
	tx, err := s.rootBolt.Begin(true)
	if err != nil {
		return err
	}
	// defer rollback on error
	defer func() {
		if err != nil {
			_ = tx.Rollback()
		}
	}()

	snapshotsBucket, err := tx.CreateBucketIfNotExists(boltSnapshotsBucket)
	if err != nil {
		return err
	}
	newSnapshotKey := segment.EncodeUvarintAscending(nil, snapshot.epoch)
	snapshotBucket, err := snapshotsBucket.CreateBucketIfNotExists(newSnapshotKey)
	if err != nil {
		return err
	}

	// persist meta values
	metaBucket, err := snapshotBucket.CreateBucketIfNotExists(boltMetaDataKey)
	if err != nil {
		return err
	}
	err = metaBucket.Put(boltMetaDataSegmentTypeKey, []byte(s.segPlugin.Type()))
	if err != nil {
		return err
	}
	buf := make([]byte, binary.MaxVarintLen32)
	binary.BigEndian.PutUint32(buf, s.segPlugin.Version())
	err = metaBucket.Put(boltMetaDataSegmentVersionKey, buf)
	if err != nil {
		return err
	}

	// persist internal values
	internalBucket, err := snapshotBucket.CreateBucketIfNotExists(boltInternalKey)
	if err != nil {
		return err
	}
	// TODO optimize writing these in order?
	for k, v := range snapshot.internal {
		err = internalBucket.Put([]byte(k), v)
		if err != nil {
			return err
		}
	}

	var filenames []string
	newSegmentPaths := make(map[uint64]string)

	// first ensure that each segment in this snapshot has been persisted
	for _, segmentSnapshot := range snapshot.segment {
		snapshotSegmentKey := segment.EncodeUvarintAscending(nil, segmentSnapshot.id)
		snapshotSegmentBucket, err := snapshotBucket.CreateBucketIfNotExists(snapshotSegmentKey)
		if err != nil {
			return err
		}
		switch seg := segmentSnapshot.segment.(type) {
		case segment.PersistedSegment:
			path := seg.Path()
			filename := strings.TrimPrefix(path, s.path+string(os.PathSeparator))
			err = snapshotSegmentBucket.Put(boltPathKey, []byte(filename))
			if err != nil {
				return err
			}
			filenames = append(filenames, filename)
		case segment.UnpersistedSegment:
			// need to persist this to disk
			filename := zapFileName(segmentSnapshot.id)
			path := s.path + string(os.PathSeparator) + filename
			err = seg.Persist(path)
			if err != nil {
				return fmt.Errorf("error persisting segment: %v", err)
			}
			newSegmentPaths[segmentSnapshot.id] = path
			err = snapshotSegmentBucket.Put(boltPathKey, []byte(filename))
			if err != nil {
				return err
			}
			filenames = append(filenames, filename)
		default:
			return fmt.Errorf("unknown segment type: %T", seg)
		}
		// store current deleted bits
		var roaringBuf bytes.Buffer
		if segmentSnapshot.deleted != nil {
			_, err = segmentSnapshot.deleted.WriteTo(&roaringBuf)
			if err != nil {
				return fmt.Errorf("error persisting roaring bytes: %v", err)
			}
			err = snapshotSegmentBucket.Put(boltDeletedKey, roaringBuf.Bytes())
			if err != nil {
				return err
			}
		}
	}

	// we need to swap in a new root only when we've persisted 1 or
	// more segments -- whereby the new root would have 1-for-1
	// replacements of in-memory segments with file-based segments
	//
	// other cases like updates to internal values only, and/or when
	// there are only deletions, are already covered and persisted by
	// the newly populated boltdb snapshotBucket above
	if len(newSegmentPaths) > 0 {
		// now try to open all the new snapshots
		newSegments := make(map[uint64]segment.Segment)
		defer func() {
			for _, s := range newSegments {
				if s != nil {
					// cleanup segments that were opened but not
					// swapped into the new root
					_ = s.Close()
				}
			}
		}()
		for segmentID, path := range newSegmentPaths {
			newSegments[segmentID], err = s.segPlugin.Open(path)
			if err != nil {
				return fmt.Errorf("error opening new segment at %s, %v", path, err)
			}
		}

		persist := &persistIntroduction{
			persisted: newSegments,
			applied:   make(notificationChan),
		}

		select {
		case <-s.closeCh:
			return segment.ErrClosed
		case s.persists <- persist:
		}

		select {
		case <-s.closeCh:
			return segment.ErrClosed
		case <-persist.applied:
		}
	}

	err = tx.Commit()
	if err != nil {
		return err
	}

	err = s.rootBolt.Sync()
	if err != nil {
		return err
	}

	// allow files to become eligible for removal after commit, such
	// as file segments from snapshots that came from the merger
	s.rootLock.Lock()
	for _, filename := range filenames {
		delete(s.ineligibleForRemoval, filename)
	}
	s.rootLock.Unlock()

	return nil
}

func zapFileName(epoch uint64) string {
	return fmt.Sprintf("%012x.zap", epoch)
}
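// For instance, zapFileName(12) yields "00000000000c.zap": the segment id is
// rendered as zero-padded, 12-digit lowercase hex, so lexical ordering of the
// filenames matches numeric ordering of the ids (up to 12 hex digits).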
// bolt snapshot code

var boltSnapshotsBucket = []byte{'s'}
var boltPathKey = []byte{'p'}
var boltDeletedKey = []byte{'d'}
var boltInternalKey = []byte{'i'}
var boltMetaDataKey = []byte{'m'}
var boltMetaDataSegmentTypeKey = []byte("type")
var boltMetaDataSegmentVersionKey = []byte("version")
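// The resulting bolt layout, as written by persistSnapshotDirect and read
// back by loadSnapshot, looks like:
//
//	"s" (snapshots bucket)
//	  <epoch, uvarint-ascending>        one bucket per snapshot
//	    "m"                             metadata bucket: "type", "version" of the segment plugin
//	    "i"                             internal bucket: user-provided key/value pairs
//	    <segment id, uvarint-ascending> one bucket per segment
//	      "p"                           segment file name, relative to the index path
//	      "d"                           serialized roaring bitmap of deleted doc numbers (optional)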
func (s *Scorch) loadFromBolt() error {
	return s.rootBolt.View(func(tx *bolt.Tx) error {
		snapshots := tx.Bucket(boltSnapshotsBucket)
		if snapshots == nil {
			return nil
		}
		foundRoot := false
		c := snapshots.Cursor()
		for k, _ := c.Last(); k != nil; k, _ = c.Prev() {
			_, snapshotEpoch, err := segment.DecodeUvarintAscending(k)
			if err != nil {
				log.Printf("unable to parse segment epoch %x, continuing", k)
				continue
			}
			if foundRoot {
				s.AddEligibleForRemoval(snapshotEpoch)
				continue
			}
			snapshot := snapshots.Bucket(k)
			if snapshot == nil {
				log.Printf("snapshot key, but bucket missing %x, continuing", k)
				s.AddEligibleForRemoval(snapshotEpoch)
				continue
			}
			indexSnapshot, err := s.loadSnapshot(snapshot)
			if err != nil {
				log.Printf("unable to load snapshot, %v, continuing", err)
				s.AddEligibleForRemoval(snapshotEpoch)
				continue
			}
			indexSnapshot.epoch = snapshotEpoch
			// set the nextSegmentID
			s.nextSegmentID, err = s.maxSegmentIDOnDisk()
			if err != nil {
				return err
			}
			s.nextSegmentID++
			s.rootLock.Lock()
			s.nextSnapshotEpoch = snapshotEpoch + 1
			rootPrev := s.root
			s.root = indexSnapshot
			s.rootLock.Unlock()

			if rootPrev != nil {
				_ = rootPrev.DecRef()
			}

			foundRoot = true
		}
		return nil
	})
}

// LoadSnapshot loads the snapshot with the specified epoch
// NOTE: this is currently ONLY intended to be used by the command-line tool
func (s *Scorch) LoadSnapshot(epoch uint64) (rv *IndexSnapshot, err error) {
	err = s.rootBolt.View(func(tx *bolt.Tx) error {
		snapshots := tx.Bucket(boltSnapshotsBucket)
		if snapshots == nil {
			return nil
		}
		snapshotKey := segment.EncodeUvarintAscending(nil, epoch)
		snapshot := snapshots.Bucket(snapshotKey)
		if snapshot == nil {
			return fmt.Errorf("snapshot with epoch: %v - doesn't exist", epoch)
		}
		rv, err = s.loadSnapshot(snapshot)
		return err
	})
	if err != nil {
		return nil, err
	}
	return rv, nil
}

func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) {
	rv := &IndexSnapshot{
		parent:   s,
		internal: make(map[string][]byte),
		refs:     1,
		creator:  "loadSnapshot",
	}

	// first we look for the meta-data bucket, this will tell us
	// which segment type/version was used for this snapshot
	// all operations for this scorch will use this type/version
	metaBucket := snapshot.Bucket(boltMetaDataKey)
	if metaBucket == nil {
		_ = rv.DecRef()
		return nil, fmt.Errorf("meta-data bucket missing")
	}
	segmentType := string(metaBucket.Get(boltMetaDataSegmentTypeKey))
	segmentVersion := binary.BigEndian.Uint32(
		metaBucket.Get(boltMetaDataSegmentVersionKey))
	err := s.loadSegmentPlugin(segmentType, segmentVersion)
	if err != nil {
		_ = rv.DecRef()
		return nil, fmt.Errorf(
			"unable to load correct segment wrapper: %v", err)
	}

	var running uint64
	c := snapshot.Cursor()
	for k, _ := c.First(); k != nil; k, _ = c.Next() {
		if k[0] == boltInternalKey[0] {
			internalBucket := snapshot.Bucket(k)
			err := internalBucket.ForEach(func(key []byte, val []byte) error {
				copiedVal := append([]byte(nil), val...)
				rv.internal[string(key)] = copiedVal
				return nil
			})
			if err != nil {
				_ = rv.DecRef()
				return nil, err
			}
		} else if k[0] != boltMetaDataKey[0] {
			segmentBucket := snapshot.Bucket(k)
			if segmentBucket == nil {
				_ = rv.DecRef()
				return nil, fmt.Errorf("segment key, but bucket missing % x", k)
			}
			segmentSnapshot, err := s.loadSegment(segmentBucket)
			if err != nil {
				_ = rv.DecRef()
				return nil, fmt.Errorf("failed to load segment: %v", err)
			}
			_, segmentSnapshot.id, err = segment.DecodeUvarintAscending(k)
			if err != nil {
				_ = rv.DecRef()
				return nil, fmt.Errorf("failed to decode segment id: %v", err)
			}
			rv.segment = append(rv.segment, segmentSnapshot)
			rv.offsets = append(rv.offsets, running)
			running += segmentSnapshot.segment.Count()
		}
	}

	return rv, nil
}

func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, error) {
	pathBytes := segmentBucket.Get(boltPathKey)
	if pathBytes == nil {
		return nil, fmt.Errorf("segment path missing")
	}
	segmentPath := s.path + string(os.PathSeparator) + string(pathBytes)
	segment, err := s.segPlugin.Open(segmentPath)
	if err != nil {
		return nil, fmt.Errorf("error opening bolt segment: %v", err)
	}

	rv := &SegmentSnapshot{
		segment:    segment,
		cachedDocs: &cachedDocs{cache: nil},
	}
	deletedBytes := segmentBucket.Get(boltDeletedKey)
	if deletedBytes != nil {
		deletedBitmap := roaring.NewBitmap()
		r := bytes.NewReader(deletedBytes)
		_, err := deletedBitmap.ReadFrom(r)
		if err != nil {
			_ = segment.Close()
			return nil, fmt.Errorf("error reading deleted bytes: %v", err)
		}
		if !deletedBitmap.IsEmpty() {
			rv.deleted = deletedBitmap
		}
	}

	return rv, nil
}

type uint64Descending []uint64

func (p uint64Descending) Len() int           { return len(p) }
func (p uint64Descending) Less(i, j int) bool { return p[i] > p[j] }
func (p uint64Descending) Swap(i, j int)      { p[i], p[j] = p[j], p[i] }
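// uint64Descending satisfies sort.Interface so that a slice of epochs can be
// ordered newest-first, e.g. sort.Sort(uint64Descending(epochs)).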
func (s *Scorch) removeOldData() {
	removed, err := s.removeOldBoltSnapshots()
	if err != nil {
		s.fireAsyncError(fmt.Errorf("got err removing old bolt snapshots: %v", err))
	}
	atomic.AddUint64(&s.stats.TotSnapshotsRemovedFromMetaStore, uint64(removed))

	err = s.removeOldZapFiles()
	if err != nil {
		s.fireAsyncError(fmt.Errorf("got err removing old zap files: %v", err))
	}
}

// NumSnapshotsToKeep represents how many recent, old snapshots to
// keep around per Scorch instance. Useful for apps that require
// rollback'ability.
var NumSnapshotsToKeep = 1

// Removes enough snapshots from the rootBolt so that the
// s.eligibleForRemoval stays under the NumSnapshotsToKeep policy.
func (s *Scorch) removeOldBoltSnapshots() (numRemoved int, err error) {
	persistedEpochs, err := s.RootBoltSnapshotEpochs()
	if err != nil {
		return 0, err
	}

	if len(persistedEpochs) <= s.numSnapshotsToKeep {
		// we need to keep everything
		return 0, nil
	}

	// make a map of epochs to protect from deletion
	protectedEpochs := make(map[uint64]struct{}, s.numSnapshotsToKeep)
	for _, epoch := range persistedEpochs[0:s.numSnapshotsToKeep] {
		protectedEpochs[epoch] = struct{}{}
	}

	var epochsToRemove []uint64
	var newEligible []uint64
	s.rootLock.Lock()
	for _, epoch := range s.eligibleForRemoval {
		if _, ok := protectedEpochs[epoch]; ok {
			// protected
			newEligible = append(newEligible, epoch)
		} else {
			epochsToRemove = append(epochsToRemove, epoch)
		}
	}
	s.eligibleForRemoval = newEligible
	s.rootLock.Unlock()

	if len(epochsToRemove) == 0 {
		return 0, nil
	}

	tx, err := s.rootBolt.Begin(true)
	if err != nil {
		return 0, err
	}
	defer func() {
		if err == nil {
			err = tx.Commit()
		} else {
			_ = tx.Rollback()
		}
		if err == nil {
			err = s.rootBolt.Sync()
		}
	}()

	snapshots := tx.Bucket(boltSnapshotsBucket)
	if snapshots == nil {
		return 0, nil
	}

	for _, epochToRemove := range epochsToRemove {
		k := segment.EncodeUvarintAscending(nil, epochToRemove)
		err = snapshots.DeleteBucket(k)
		if err == bolt.ErrBucketNotFound {
			err = nil
		}
		if err == nil {
			numRemoved++
		}
	}

	return numRemoved, err
}

func (s *Scorch) maxSegmentIDOnDisk() (uint64, error) {
	currFileInfos, err := ioutil.ReadDir(s.path)
	if err != nil {
		return 0, err
	}

	var rv uint64
	for _, finfo := range currFileInfos {
		fname := finfo.Name()
		if filepath.Ext(fname) == ".zap" {
			prefix := strings.TrimSuffix(fname, ".zap")
			id, err2 := strconv.ParseUint(prefix, 16, 64)
			if err2 != nil {
				return 0, err2
			}
			if id > rv {
				rv = id
			}
		}
	}
	return rv, err
}

// Removes any *.zap files which aren't listed in the rootBolt.
func (s *Scorch) removeOldZapFiles() error {
	liveFileNames, err := s.loadZapFileNames()
	if err != nil {
		return err
	}

	currFileInfos, err := ioutil.ReadDir(s.path)
	if err != nil {
		return err
	}

	s.rootLock.RLock()

	for _, finfo := range currFileInfos {
		fname := finfo.Name()
		if filepath.Ext(fname) == ".zap" {
			if _, exists := liveFileNames[fname]; !exists && !s.ineligibleForRemoval[fname] {
				err := os.Remove(s.path + string(os.PathSeparator) + fname)
				if err != nil {
					log.Printf("got err removing file: %s, err: %v", fname, err)
				}
			}
		}
	}

	s.rootLock.RUnlock()

	return nil
}

func (s *Scorch) RootBoltSnapshotEpochs() ([]uint64, error) {
	var rv []uint64
	err := s.rootBolt.View(func(tx *bolt.Tx) error {
		snapshots := tx.Bucket(boltSnapshotsBucket)
		if snapshots == nil {
			return nil
		}
		sc := snapshots.Cursor()
		for sk, _ := sc.Last(); sk != nil; sk, _ = sc.Prev() {
			_, snapshotEpoch, err := segment.DecodeUvarintAscending(sk)
			if err != nil {
				continue
			}
			rv = append(rv, snapshotEpoch)
		}
		return nil
	})
	return rv, err
}

// Returns the *.zap file names that are listed in the rootBolt.
func (s *Scorch) loadZapFileNames() (map[string]struct{}, error) {
	rv := map[string]struct{}{}
	err := s.rootBolt.View(func(tx *bolt.Tx) error {
		snapshots := tx.Bucket(boltSnapshotsBucket)
		if snapshots == nil {
			return nil
		}
		sc := snapshots.Cursor()
		for sk, _ := sc.First(); sk != nil; sk, _ = sc.Next() {
			snapshot := snapshots.Bucket(sk)
			if snapshot == nil {
				continue
			}
			segc := snapshot.Cursor()
			for segk, _ := segc.First(); segk != nil; segk, _ = segc.Next() {
				if segk[0] == boltInternalKey[0] {
					continue
				}
				segmentBucket := snapshot.Bucket(segk)
				if segmentBucket == nil {
					continue
				}
				pathBytes := segmentBucket.Get(boltPathKey)
				if pathBytes == nil {
					continue
				}
				pathString := string(pathBytes)
				rv[string(pathString)] = struct{}{}
			}
		}
		return nil
	})
	return rv, err
}