You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

docvalues.go 6.2KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213
  1. // Copyright (c) 2017 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package zap
  15. import (
  16. "bytes"
  17. "encoding/binary"
  18. "fmt"
  19. "math"
  20. "sort"
  21. "github.com/blevesearch/bleve/index"
  22. "github.com/blevesearch/bleve/index/scorch/segment"
  23. "github.com/golang/snappy"
  24. )
  25. type docValueIterator struct {
  26. field string
  27. curChunkNum uint64
  28. numChunks uint64
  29. chunkLens []uint64
  30. dvDataLoc uint64
  31. curChunkHeader []MetaData
  32. curChunkData []byte // compressed data cache
  33. }
  34. func (di *docValueIterator) sizeInBytes() uint64 {
  35. // curChunkNum, numChunks, dvDataLoc --> uint64
  36. sizeInBytes := 24
  37. // field
  38. sizeInBytes += (len(di.field) + int(segment.SizeOfString))
  39. // chunkLens, curChunkHeader
  40. sizeInBytes += len(di.chunkLens)*8 +
  41. len(di.curChunkHeader)*24 +
  42. int(segment.SizeOfSlice*2) /* overhead from slices */
  43. // curChunkData is mmap'ed, not included
  44. return uint64(sizeInBytes)
  45. }
  46. func (di *docValueIterator) fieldName() string {
  47. return di.field
  48. }
  49. func (di *docValueIterator) curChunkNumber() uint64 {
  50. return di.curChunkNum
  51. }
  52. func (s *SegmentBase) loadFieldDocValueIterator(field string,
  53. fieldDvLoc uint64) (*docValueIterator, error) {
  54. // get the docValue offset for the given fields
  55. if fieldDvLoc == fieldNotUninverted {
  56. return nil, fmt.Errorf("loadFieldDocValueIterator: "+
  57. "no docValues found for field: %s", field)
  58. }
  59. // read the number of chunks, chunk lengths
  60. var offset, clen uint64
  61. numChunks, read := binary.Uvarint(s.mem[fieldDvLoc : fieldDvLoc+binary.MaxVarintLen64])
  62. if read <= 0 {
  63. return nil, fmt.Errorf("failed to read the field "+
  64. "doc values for field %s", field)
  65. }
  66. offset += uint64(read)
  67. fdvIter := &docValueIterator{
  68. curChunkNum: math.MaxUint64,
  69. field: field,
  70. chunkLens: make([]uint64, int(numChunks)),
  71. }
  72. for i := 0; i < int(numChunks); i++ {
  73. clen, read = binary.Uvarint(s.mem[fieldDvLoc+offset : fieldDvLoc+offset+binary.MaxVarintLen64])
  74. if read <= 0 {
  75. return nil, fmt.Errorf("corrupted chunk length during segment load")
  76. }
  77. fdvIter.chunkLens[i] = clen
  78. offset += uint64(read)
  79. }
  80. fdvIter.dvDataLoc = fieldDvLoc + offset
  81. return fdvIter, nil
  82. }
  83. func (di *docValueIterator) loadDvChunk(chunkNumber,
  84. localDocNum uint64, s *SegmentBase) error {
  85. // advance to the chunk where the docValues
  86. // reside for the given docNum
  87. destChunkDataLoc := di.dvDataLoc
  88. for i := 0; i < int(chunkNumber); i++ {
  89. destChunkDataLoc += di.chunkLens[i]
  90. }
  91. curChunkSize := di.chunkLens[chunkNumber]
  92. // read the number of docs reside in the chunk
  93. numDocs, read := binary.Uvarint(s.mem[destChunkDataLoc : destChunkDataLoc+binary.MaxVarintLen64])
  94. if read <= 0 {
  95. return fmt.Errorf("failed to read the chunk")
  96. }
  97. chunkMetaLoc := destChunkDataLoc + uint64(read)
  98. offset := uint64(0)
  99. di.curChunkHeader = make([]MetaData, int(numDocs))
  100. for i := 0; i < int(numDocs); i++ {
  101. di.curChunkHeader[i].DocNum, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
  102. offset += uint64(read)
  103. di.curChunkHeader[i].DocDvLoc, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
  104. offset += uint64(read)
  105. di.curChunkHeader[i].DocDvLen, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
  106. offset += uint64(read)
  107. }
  108. compressedDataLoc := chunkMetaLoc + offset
  109. dataLength := destChunkDataLoc + curChunkSize - compressedDataLoc
  110. di.curChunkData = s.mem[compressedDataLoc : compressedDataLoc+dataLength]
  111. di.curChunkNum = chunkNumber
  112. return nil
  113. }
  114. func (di *docValueIterator) visitDocValues(docNum uint64,
  115. visitor index.DocumentFieldTermVisitor) error {
  116. // binary search the term locations for the docNum
  117. start, length := di.getDocValueLocs(docNum)
  118. if start == math.MaxUint64 || length == math.MaxUint64 {
  119. return nil
  120. }
  121. // uncompress the already loaded data
  122. uncompressed, err := snappy.Decode(nil, di.curChunkData)
  123. if err != nil {
  124. return err
  125. }
  126. // pick the terms for the given docNum
  127. uncompressed = uncompressed[start : start+length]
  128. for {
  129. i := bytes.Index(uncompressed, termSeparatorSplitSlice)
  130. if i < 0 {
  131. break
  132. }
  133. visitor(di.field, uncompressed[0:i])
  134. uncompressed = uncompressed[i+1:]
  135. }
  136. return nil
  137. }
  138. func (di *docValueIterator) getDocValueLocs(docNum uint64) (uint64, uint64) {
  139. i := sort.Search(len(di.curChunkHeader), func(i int) bool {
  140. return di.curChunkHeader[i].DocNum >= docNum
  141. })
  142. if i < len(di.curChunkHeader) && di.curChunkHeader[i].DocNum == docNum {
  143. return di.curChunkHeader[i].DocDvLoc, di.curChunkHeader[i].DocDvLen
  144. }
  145. return math.MaxUint64, math.MaxUint64
  146. }
  147. // VisitDocumentFieldTerms is an implementation of the
  148. // DocumentFieldTermVisitable interface
  149. func (s *SegmentBase) VisitDocumentFieldTerms(localDocNum uint64, fields []string,
  150. visitor index.DocumentFieldTermVisitor) error {
  151. fieldIDPlus1 := uint16(0)
  152. ok := true
  153. for _, field := range fields {
  154. if fieldIDPlus1, ok = s.fieldsMap[field]; !ok {
  155. continue
  156. }
  157. // find the chunkNumber where the docValues are stored
  158. docInChunk := localDocNum / uint64(s.chunkFactor)
  159. if dvIter, exists := s.fieldDvIterMap[fieldIDPlus1-1]; exists &&
  160. dvIter != nil {
  161. // check if the chunk is already loaded
  162. if docInChunk != dvIter.curChunkNumber() {
  163. err := dvIter.loadDvChunk(docInChunk, localDocNum, s)
  164. if err != nil {
  165. continue
  166. }
  167. }
  168. _ = dvIter.visitDocValues(localDocNum, visitor)
  169. }
  170. }
  171. return nil
  172. }
  173. // VisitableDocValueFields returns the list of fields with
  174. // persisted doc value terms ready to be visitable using the
  175. // VisitDocumentFieldTerms method.
  176. func (s *Segment) VisitableDocValueFields() ([]string, error) {
  177. var rv []string
  178. for fieldID, field := range s.fieldsInv {
  179. if dvIter, ok := s.fieldDvIterMap[uint16(fieldID)]; ok &&
  180. dvIter != nil {
  181. rv = append(rv, field)
  182. }
  183. }
  184. return rv, nil
  185. }