You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607
  1. package bbolt
  2. import (
  3. "bytes"
  4. "fmt"
  5. "reflect"
  6. "sort"
  7. "unsafe"
  8. )
  9. // node represents an in-memory, deserialized page.
  10. type node struct {
  11. bucket *Bucket
  12. isLeaf bool
  13. unbalanced bool
  14. spilled bool
  15. key []byte
  16. pgid pgid
  17. parent *node
  18. children nodes
  19. inodes inodes
  20. }
  21. // root returns the top-level node this node is attached to.
  22. func (n *node) root() *node {
  23. if n.parent == nil {
  24. return n
  25. }
  26. return n.parent.root()
  27. }
  28. // minKeys returns the minimum number of inodes this node should have.
  29. func (n *node) minKeys() int {
  30. if n.isLeaf {
  31. return 1
  32. }
  33. return 2
  34. }
  35. // size returns the size of the node after serialization.
  36. func (n *node) size() int {
  37. sz, elsz := pageHeaderSize, n.pageElementSize()
  38. for i := 0; i < len(n.inodes); i++ {
  39. item := &n.inodes[i]
  40. sz += elsz + uintptr(len(item.key)) + uintptr(len(item.value))
  41. }
  42. return int(sz)
  43. }
  44. // sizeLessThan returns true if the node is less than a given size.
  45. // This is an optimization to avoid calculating a large node when we only need
  46. // to know if it fits inside a certain page size.
  47. func (n *node) sizeLessThan(v uintptr) bool {
  48. sz, elsz := pageHeaderSize, n.pageElementSize()
  49. for i := 0; i < len(n.inodes); i++ {
  50. item := &n.inodes[i]
  51. sz += elsz + uintptr(len(item.key)) + uintptr(len(item.value))
  52. if sz >= v {
  53. return false
  54. }
  55. }
  56. return true
  57. }
  58. // pageElementSize returns the size of each page element based on the type of node.
  59. func (n *node) pageElementSize() uintptr {
  60. if n.isLeaf {
  61. return leafPageElementSize
  62. }
  63. return branchPageElementSize
  64. }
  65. // childAt returns the child node at a given index.
  66. func (n *node) childAt(index int) *node {
  67. if n.isLeaf {
  68. panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index))
  69. }
  70. return n.bucket.node(n.inodes[index].pgid, n)
  71. }
  72. // childIndex returns the index of a given child node.
  73. func (n *node) childIndex(child *node) int {
  74. index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 })
  75. return index
  76. }
  77. // numChildren returns the number of children.
  78. func (n *node) numChildren() int {
  79. return len(n.inodes)
  80. }
  81. // nextSibling returns the next node with the same parent.
  82. func (n *node) nextSibling() *node {
  83. if n.parent == nil {
  84. return nil
  85. }
  86. index := n.parent.childIndex(n)
  87. if index >= n.parent.numChildren()-1 {
  88. return nil
  89. }
  90. return n.parent.childAt(index + 1)
  91. }
  92. // prevSibling returns the previous node with the same parent.
  93. func (n *node) prevSibling() *node {
  94. if n.parent == nil {
  95. return nil
  96. }
  97. index := n.parent.childIndex(n)
  98. if index == 0 {
  99. return nil
  100. }
  101. return n.parent.childAt(index - 1)
  102. }
  103. // put inserts a key/value.
  104. func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) {
  105. if pgid >= n.bucket.tx.meta.pgid {
  106. panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid))
  107. } else if len(oldKey) <= 0 {
  108. panic("put: zero-length old key")
  109. } else if len(newKey) <= 0 {
  110. panic("put: zero-length new key")
  111. }
  112. // Find insertion index.
  113. index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 })
  114. // Add capacity and shift nodes if we don't have an exact match and need to insert.
  115. exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey))
  116. if !exact {
  117. n.inodes = append(n.inodes, inode{})
  118. copy(n.inodes[index+1:], n.inodes[index:])
  119. }
  120. inode := &n.inodes[index]
  121. inode.flags = flags
  122. inode.key = newKey
  123. inode.value = value
  124. inode.pgid = pgid
  125. _assert(len(inode.key) > 0, "put: zero-length inode key")
  126. }
  127. // del removes a key from the node.
  128. func (n *node) del(key []byte) {
  129. // Find index of key.
  130. index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 })
  131. // Exit if the key isn't found.
  132. if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].key, key) {
  133. return
  134. }
  135. // Delete inode from the node.
  136. n.inodes = append(n.inodes[:index], n.inodes[index+1:]...)
  137. // Mark the node as needing rebalancing.
  138. n.unbalanced = true
  139. }
  140. // read initializes the node from a page.
  141. func (n *node) read(p *page) {
  142. n.pgid = p.id
  143. n.isLeaf = ((p.flags & leafPageFlag) != 0)
  144. n.inodes = make(inodes, int(p.count))
  145. for i := 0; i < int(p.count); i++ {
  146. inode := &n.inodes[i]
  147. if n.isLeaf {
  148. elem := p.leafPageElement(uint16(i))
  149. inode.flags = elem.flags
  150. inode.key = elem.key()
  151. inode.value = elem.value()
  152. } else {
  153. elem := p.branchPageElement(uint16(i))
  154. inode.pgid = elem.pgid
  155. inode.key = elem.key()
  156. }
  157. _assert(len(inode.key) > 0, "read: zero-length inode key")
  158. }
  159. // Save first key so we can find the node in the parent when we spill.
  160. if len(n.inodes) > 0 {
  161. n.key = n.inodes[0].key
  162. _assert(len(n.key) > 0, "read: zero-length node key")
  163. } else {
  164. n.key = nil
  165. }
  166. }
  167. // write writes the items onto one or more pages.
  168. func (n *node) write(p *page) {
  169. // Initialize page.
  170. if n.isLeaf {
  171. p.flags |= leafPageFlag
  172. } else {
  173. p.flags |= branchPageFlag
  174. }
  175. if len(n.inodes) >= 0xFFFF {
  176. panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id))
  177. }
  178. p.count = uint16(len(n.inodes))
  179. // Stop here if there are no items to write.
  180. if p.count == 0 {
  181. return
  182. }
  183. // Loop over each item and write it to the page.
  184. bp := uintptr(unsafe.Pointer(p)) + unsafe.Sizeof(*p) + n.pageElementSize()*uintptr(len(n.inodes))
  185. for i, item := range n.inodes {
  186. _assert(len(item.key) > 0, "write: zero-length inode key")
  187. // Write the page element.
  188. if n.isLeaf {
  189. elem := p.leafPageElement(uint16(i))
  190. elem.pos = uint32(bp - uintptr(unsafe.Pointer(elem)))
  191. elem.flags = item.flags
  192. elem.ksize = uint32(len(item.key))
  193. elem.vsize = uint32(len(item.value))
  194. } else {
  195. elem := p.branchPageElement(uint16(i))
  196. elem.pos = uint32(bp - uintptr(unsafe.Pointer(elem)))
  197. elem.ksize = uint32(len(item.key))
  198. elem.pgid = item.pgid
  199. _assert(elem.pgid != p.id, "write: circular dependency occurred")
  200. }
  201. // Create a slice to write into of needed size and advance
  202. // byte pointer for next iteration.
  203. klen, vlen := len(item.key), len(item.value)
  204. sz := klen + vlen
  205. b := *(*[]byte)(unsafe.Pointer(&reflect.SliceHeader{
  206. Data: bp,
  207. Len: sz,
  208. Cap: sz,
  209. }))
  210. bp += uintptr(sz)
  211. // Write data for the element to the end of the page.
  212. l := copy(b, item.key)
  213. copy(b[l:], item.value)
  214. }
  215. // DEBUG ONLY: n.dump()
  216. }
  217. // split breaks up a node into multiple smaller nodes, if appropriate.
  218. // This should only be called from the spill() function.
  219. func (n *node) split(pageSize uintptr) []*node {
  220. var nodes []*node
  221. node := n
  222. for {
  223. // Split node into two.
  224. a, b := node.splitTwo(pageSize)
  225. nodes = append(nodes, a)
  226. // If we can't split then exit the loop.
  227. if b == nil {
  228. break
  229. }
  230. // Set node to b so it gets split on the next iteration.
  231. node = b
  232. }
  233. return nodes
  234. }
  235. // splitTwo breaks up a node into two smaller nodes, if appropriate.
  236. // This should only be called from the split() function.
  237. func (n *node) splitTwo(pageSize uintptr) (*node, *node) {
  238. // Ignore the split if the page doesn't have at least enough nodes for
  239. // two pages or if the nodes can fit in a single page.
  240. if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) {
  241. return n, nil
  242. }
  243. // Determine the threshold before starting a new node.
  244. var fillPercent = n.bucket.FillPercent
  245. if fillPercent < minFillPercent {
  246. fillPercent = minFillPercent
  247. } else if fillPercent > maxFillPercent {
  248. fillPercent = maxFillPercent
  249. }
  250. threshold := int(float64(pageSize) * fillPercent)
  251. // Determine split position and sizes of the two pages.
  252. splitIndex, _ := n.splitIndex(threshold)
  253. // Split node into two separate nodes.
  254. // If there's no parent then we'll need to create one.
  255. if n.parent == nil {
  256. n.parent = &node{bucket: n.bucket, children: []*node{n}}
  257. }
  258. // Create a new node and add it to the parent.
  259. next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
  260. n.parent.children = append(n.parent.children, next)
  261. // Split inodes across two nodes.
  262. next.inodes = n.inodes[splitIndex:]
  263. n.inodes = n.inodes[:splitIndex]
  264. // Update the statistics.
  265. n.bucket.tx.stats.Split++
  266. return n, next
  267. }
  268. // splitIndex finds the position where a page will fill a given threshold.
  269. // It returns the index as well as the size of the first page.
  270. // This is only be called from split().
  271. func (n *node) splitIndex(threshold int) (index, sz uintptr) {
  272. sz = pageHeaderSize
  273. // Loop until we only have the minimum number of keys required for the second page.
  274. for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
  275. index = uintptr(i)
  276. inode := n.inodes[i]
  277. elsize := n.pageElementSize() + uintptr(len(inode.key)) + uintptr(len(inode.value))
  278. // If we have at least the minimum number of keys and adding another
  279. // node would put us over the threshold then exit and return.
  280. if index >= minKeysPerPage && sz+elsize > uintptr(threshold) {
  281. break
  282. }
  283. // Add the element size to the total size.
  284. sz += elsize
  285. }
  286. return
  287. }
  288. // spill writes the nodes to dirty pages and splits nodes as it goes.
  289. // Returns an error if dirty pages cannot be allocated.
  290. func (n *node) spill() error {
  291. var tx = n.bucket.tx
  292. if n.spilled {
  293. return nil
  294. }
  295. // Spill child nodes first. Child nodes can materialize sibling nodes in
  296. // the case of split-merge so we cannot use a range loop. We have to check
  297. // the children size on every loop iteration.
  298. sort.Sort(n.children)
  299. for i := 0; i < len(n.children); i++ {
  300. if err := n.children[i].spill(); err != nil {
  301. return err
  302. }
  303. }
  304. // We no longer need the child list because it's only used for spill tracking.
  305. n.children = nil
  306. // Split nodes into appropriate sizes. The first node will always be n.
  307. var nodes = n.split(uintptr(tx.db.pageSize))
  308. for _, node := range nodes {
  309. // Add node's page to the freelist if it's not new.
  310. if node.pgid > 0 {
  311. tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid))
  312. node.pgid = 0
  313. }
  314. // Allocate contiguous space for the node.
  315. p, err := tx.allocate((node.size() + tx.db.pageSize - 1) / tx.db.pageSize)
  316. if err != nil {
  317. return err
  318. }
  319. // Write the node.
  320. if p.id >= tx.meta.pgid {
  321. panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid))
  322. }
  323. node.pgid = p.id
  324. node.write(p)
  325. node.spilled = true
  326. // Insert into parent inodes.
  327. if node.parent != nil {
  328. var key = node.key
  329. if key == nil {
  330. key = node.inodes[0].key
  331. }
  332. node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
  333. node.key = node.inodes[0].key
  334. _assert(len(node.key) > 0, "spill: zero-length node key")
  335. }
  336. // Update the statistics.
  337. tx.stats.Spill++
  338. }
  339. // If the root node split and created a new root then we need to spill that
  340. // as well. We'll clear out the children to make sure it doesn't try to respill.
  341. if n.parent != nil && n.parent.pgid == 0 {
  342. n.children = nil
  343. return n.parent.spill()
  344. }
  345. return nil
  346. }
  347. // rebalance attempts to combine the node with sibling nodes if the node fill
  348. // size is below a threshold or if there are not enough keys.
  349. func (n *node) rebalance() {
  350. if !n.unbalanced {
  351. return
  352. }
  353. n.unbalanced = false
  354. // Update statistics.
  355. n.bucket.tx.stats.Rebalance++
  356. // Ignore if node is above threshold (25%) and has enough keys.
  357. var threshold = n.bucket.tx.db.pageSize / 4
  358. if n.size() > threshold && len(n.inodes) > n.minKeys() {
  359. return
  360. }
  361. // Root node has special handling.
  362. if n.parent == nil {
  363. // If root node is a branch and only has one node then collapse it.
  364. if !n.isLeaf && len(n.inodes) == 1 {
  365. // Move root's child up.
  366. child := n.bucket.node(n.inodes[0].pgid, n)
  367. n.isLeaf = child.isLeaf
  368. n.inodes = child.inodes[:]
  369. n.children = child.children
  370. // Reparent all child nodes being moved.
  371. for _, inode := range n.inodes {
  372. if child, ok := n.bucket.nodes[inode.pgid]; ok {
  373. child.parent = n
  374. }
  375. }
  376. // Remove old child.
  377. child.parent = nil
  378. delete(n.bucket.nodes, child.pgid)
  379. child.free()
  380. }
  381. return
  382. }
  383. // If node has no keys then just remove it.
  384. if n.numChildren() == 0 {
  385. n.parent.del(n.key)
  386. n.parent.removeChild(n)
  387. delete(n.bucket.nodes, n.pgid)
  388. n.free()
  389. n.parent.rebalance()
  390. return
  391. }
  392. _assert(n.parent.numChildren() > 1, "parent must have at least 2 children")
  393. // Destination node is right sibling if idx == 0, otherwise left sibling.
  394. var target *node
  395. var useNextSibling = (n.parent.childIndex(n) == 0)
  396. if useNextSibling {
  397. target = n.nextSibling()
  398. } else {
  399. target = n.prevSibling()
  400. }
  401. // If both this node and the target node are too small then merge them.
  402. if useNextSibling {
  403. // Reparent all child nodes being moved.
  404. for _, inode := range target.inodes {
  405. if child, ok := n.bucket.nodes[inode.pgid]; ok {
  406. child.parent.removeChild(child)
  407. child.parent = n
  408. child.parent.children = append(child.parent.children, child)
  409. }
  410. }
  411. // Copy over inodes from target and remove target.
  412. n.inodes = append(n.inodes, target.inodes...)
  413. n.parent.del(target.key)
  414. n.parent.removeChild(target)
  415. delete(n.bucket.nodes, target.pgid)
  416. target.free()
  417. } else {
  418. // Reparent all child nodes being moved.
  419. for _, inode := range n.inodes {
  420. if child, ok := n.bucket.nodes[inode.pgid]; ok {
  421. child.parent.removeChild(child)
  422. child.parent = target
  423. child.parent.children = append(child.parent.children, child)
  424. }
  425. }
  426. // Copy over inodes to target and remove node.
  427. target.inodes = append(target.inodes, n.inodes...)
  428. n.parent.del(n.key)
  429. n.parent.removeChild(n)
  430. delete(n.bucket.nodes, n.pgid)
  431. n.free()
  432. }
  433. // Either this node or the target node was deleted from the parent so rebalance it.
  434. n.parent.rebalance()
  435. }
  436. // removes a node from the list of in-memory children.
  437. // This does not affect the inodes.
  438. func (n *node) removeChild(target *node) {
  439. for i, child := range n.children {
  440. if child == target {
  441. n.children = append(n.children[:i], n.children[i+1:]...)
  442. return
  443. }
  444. }
  445. }
  446. // dereference causes the node to copy all its inode key/value references to heap memory.
  447. // This is required when the mmap is reallocated so inodes are not pointing to stale data.
  448. func (n *node) dereference() {
  449. if n.key != nil {
  450. key := make([]byte, len(n.key))
  451. copy(key, n.key)
  452. n.key = key
  453. _assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node")
  454. }
  455. for i := range n.inodes {
  456. inode := &n.inodes[i]
  457. key := make([]byte, len(inode.key))
  458. copy(key, inode.key)
  459. inode.key = key
  460. _assert(len(inode.key) > 0, "dereference: zero-length inode key")
  461. value := make([]byte, len(inode.value))
  462. copy(value, inode.value)
  463. inode.value = value
  464. }
  465. // Recursively dereference children.
  466. for _, child := range n.children {
  467. child.dereference()
  468. }
  469. // Update statistics.
  470. n.bucket.tx.stats.NodeDeref++
  471. }
  472. // free adds the node's underlying page to the freelist.
  473. func (n *node) free() {
  474. if n.pgid != 0 {
  475. n.bucket.tx.db.freelist.free(n.bucket.tx.meta.txid, n.bucket.tx.page(n.pgid))
  476. n.pgid = 0
  477. }
  478. }
  479. // dump writes the contents of the node to STDERR for debugging purposes.
  480. /*
  481. func (n *node) dump() {
  482. // Write node header.
  483. var typ = "branch"
  484. if n.isLeaf {
  485. typ = "leaf"
  486. }
  487. warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes))
  488. // Write out abbreviated version of each item.
  489. for _, item := range n.inodes {
  490. if n.isLeaf {
  491. if item.flags&bucketLeafFlag != 0 {
  492. bucket := (*bucket)(unsafe.Pointer(&item.value[0]))
  493. warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root)
  494. } else {
  495. warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4))
  496. }
  497. } else {
  498. warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid)
  499. }
  500. }
  501. warn("")
  502. }
  503. */
  504. type nodes []*node
  505. func (s nodes) Len() int { return len(s) }
  506. func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
  507. func (s nodes) Less(i, j int) bool {
  508. return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1
  509. }
  510. // inode represents an internal node inside of a node.
  511. // It can be used to point to elements in a page or point
  512. // to an element which hasn't been added to a page yet.
  513. type inode struct {
  514. flags uint32
  515. pgid pgid
  516. key []byte
  517. value []byte
  518. }
  519. type inodes []inode