You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

snapshot_segment.go 6.3KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279
  1. // Copyright (c) 2017 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package scorch
  15. import (
  16. "bytes"
  17. "sync"
  18. "sync/atomic"
  19. "github.com/RoaringBitmap/roaring"
  20. "github.com/blevesearch/bleve/v2/size"
  21. index "github.com/blevesearch/bleve_index_api"
  22. segment "github.com/blevesearch/scorch_segment_api/v2"
  23. )
  24. var TermSeparator byte = 0xff
  25. var TermSeparatorSplitSlice = []byte{TermSeparator}
  26. type SegmentSnapshot struct {
  27. id uint64
  28. segment segment.Segment
  29. deleted *roaring.Bitmap
  30. creator string
  31. cachedDocs *cachedDocs
  32. }
  33. func (s *SegmentSnapshot) Segment() segment.Segment {
  34. return s.segment
  35. }
  36. func (s *SegmentSnapshot) Deleted() *roaring.Bitmap {
  37. return s.deleted
  38. }
  39. func (s *SegmentSnapshot) Id() uint64 {
  40. return s.id
  41. }
  42. func (s *SegmentSnapshot) FullSize() int64 {
  43. return int64(s.segment.Count())
  44. }
  45. func (s SegmentSnapshot) LiveSize() int64 {
  46. return int64(s.Count())
  47. }
  48. func (s *SegmentSnapshot) Close() error {
  49. return s.segment.Close()
  50. }
  51. func (s *SegmentSnapshot) VisitDocument(num uint64, visitor segment.StoredFieldValueVisitor) error {
  52. return s.segment.VisitStoredFields(num, visitor)
  53. }
  54. func (s *SegmentSnapshot) DocID(num uint64) ([]byte, error) {
  55. return s.segment.DocID(num)
  56. }
  57. func (s *SegmentSnapshot) Count() uint64 {
  58. rv := s.segment.Count()
  59. if s.deleted != nil {
  60. rv -= s.deleted.GetCardinality()
  61. }
  62. return rv
  63. }
  64. func (s *SegmentSnapshot) DocNumbers(docIDs []string) (*roaring.Bitmap, error) {
  65. rv, err := s.segment.DocNumbers(docIDs)
  66. if err != nil {
  67. return nil, err
  68. }
  69. if s.deleted != nil {
  70. rv.AndNot(s.deleted)
  71. }
  72. return rv, nil
  73. }
  74. // DocNumbersLive returns a bitmap containing doc numbers for all live docs
  75. func (s *SegmentSnapshot) DocNumbersLive() *roaring.Bitmap {
  76. rv := roaring.NewBitmap()
  77. rv.AddRange(0, s.segment.Count())
  78. if s.deleted != nil {
  79. rv.AndNot(s.deleted)
  80. }
  81. return rv
  82. }
  83. func (s *SegmentSnapshot) Fields() []string {
  84. return s.segment.Fields()
  85. }
  86. func (s *SegmentSnapshot) Size() (rv int) {
  87. rv = s.segment.Size()
  88. if s.deleted != nil {
  89. rv += int(s.deleted.GetSizeInBytes())
  90. }
  91. rv += s.cachedDocs.Size()
  92. return
  93. }
  94. type cachedFieldDocs struct {
  95. m sync.Mutex
  96. readyCh chan struct{} // closed when the cachedFieldDocs.docs is ready to be used.
  97. err error // Non-nil if there was an error when preparing this cachedFieldDocs.
  98. docs map[uint64][]byte // Keyed by localDocNum, value is a list of terms delimited by 0xFF.
  99. size uint64
  100. }
  101. func (cfd *cachedFieldDocs) Size() int {
  102. var rv int
  103. cfd.m.Lock()
  104. for _, entry := range cfd.docs {
  105. rv += 8 /* size of uint64 */ + len(entry)
  106. }
  107. cfd.m.Unlock()
  108. return rv
  109. }
  110. func (cfd *cachedFieldDocs) prepareField(field string, ss *SegmentSnapshot) {
  111. cfd.m.Lock()
  112. defer func() {
  113. close(cfd.readyCh)
  114. cfd.m.Unlock()
  115. }()
  116. cfd.size += uint64(size.SizeOfUint64) /* size field */
  117. dict, err := ss.segment.Dictionary(field)
  118. if err != nil {
  119. cfd.err = err
  120. return
  121. }
  122. var postings segment.PostingsList
  123. var postingsItr segment.PostingsIterator
  124. dictItr := dict.AutomatonIterator(nil, nil, nil)
  125. next, err := dictItr.Next()
  126. for err == nil && next != nil {
  127. var err1 error
  128. postings, err1 = dict.PostingsList([]byte(next.Term), nil, postings)
  129. if err1 != nil {
  130. cfd.err = err1
  131. return
  132. }
  133. cfd.size += uint64(size.SizeOfUint64) /* map key */
  134. postingsItr = postings.Iterator(false, false, false, postingsItr)
  135. nextPosting, err2 := postingsItr.Next()
  136. for err2 == nil && nextPosting != nil {
  137. docNum := nextPosting.Number()
  138. cfd.docs[docNum] = append(cfd.docs[docNum], []byte(next.Term)...)
  139. cfd.docs[docNum] = append(cfd.docs[docNum], TermSeparator)
  140. cfd.size += uint64(len(next.Term) + 1) // map value
  141. nextPosting, err2 = postingsItr.Next()
  142. }
  143. if err2 != nil {
  144. cfd.err = err2
  145. return
  146. }
  147. next, err = dictItr.Next()
  148. }
  149. if err != nil {
  150. cfd.err = err
  151. return
  152. }
  153. }
  154. type cachedDocs struct {
  155. size uint64
  156. m sync.Mutex // As the cache is asynchronously prepared, need a lock
  157. cache map[string]*cachedFieldDocs // Keyed by field
  158. }
  159. func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) error {
  160. c.m.Lock()
  161. if c.cache == nil {
  162. c.cache = make(map[string]*cachedFieldDocs, len(ss.Fields()))
  163. }
  164. for _, field := range wantedFields {
  165. _, exists := c.cache[field]
  166. if !exists {
  167. c.cache[field] = &cachedFieldDocs{
  168. readyCh: make(chan struct{}),
  169. docs: make(map[uint64][]byte),
  170. }
  171. go c.cache[field].prepareField(field, ss)
  172. }
  173. }
  174. for _, field := range wantedFields {
  175. cachedFieldDocs := c.cache[field]
  176. c.m.Unlock()
  177. <-cachedFieldDocs.readyCh
  178. if cachedFieldDocs.err != nil {
  179. return cachedFieldDocs.err
  180. }
  181. c.m.Lock()
  182. }
  183. c.updateSizeLOCKED()
  184. c.m.Unlock()
  185. return nil
  186. }
  187. // hasFields returns true if the cache has all the given fields
  188. func (c *cachedDocs) hasFields(fields []string) bool {
  189. c.m.Lock()
  190. for _, field := range fields {
  191. if _, exists := c.cache[field]; !exists {
  192. c.m.Unlock()
  193. return false // found a field not in cache
  194. }
  195. }
  196. c.m.Unlock()
  197. return true
  198. }
  199. func (c *cachedDocs) Size() int {
  200. return int(atomic.LoadUint64(&c.size))
  201. }
  202. func (c *cachedDocs) updateSizeLOCKED() {
  203. sizeInBytes := 0
  204. for k, v := range c.cache { // cachedFieldDocs
  205. sizeInBytes += len(k)
  206. if v != nil {
  207. sizeInBytes += v.Size()
  208. }
  209. }
  210. atomic.StoreUint64(&c.size, uint64(sizeInBytes))
  211. }
  212. func (c *cachedDocs) visitDoc(localDocNum uint64,
  213. fields []string, visitor index.DocValueVisitor) {
  214. c.m.Lock()
  215. for _, field := range fields {
  216. if cachedFieldDocs, exists := c.cache[field]; exists {
  217. c.m.Unlock()
  218. <-cachedFieldDocs.readyCh
  219. c.m.Lock()
  220. if tlist, exists := cachedFieldDocs.docs[localDocNum]; exists {
  221. for {
  222. i := bytes.Index(tlist, TermSeparatorSplitSlice)
  223. if i < 0 {
  224. break
  225. }
  226. visitor(field, tlist[0:i])
  227. tlist = tlist[i+1:]
  228. }
  229. }
  230. }
  231. }
  232. c.m.Unlock()
  233. }