You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

docvalues.go 8.6KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311
  1. // Copyright (c) 2017 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package zap
  15. import (
  16. "bytes"
  17. "encoding/binary"
  18. "fmt"
  19. "math"
  20. "reflect"
  21. "sort"
  22. index "github.com/blevesearch/bleve_index_api"
  23. segment "github.com/blevesearch/scorch_segment_api/v2"
  24. "github.com/golang/snappy"
  25. )
// reflectStaticSizedocValueReader caches the reflect-measured size in bytes
// of an empty docValueReader struct; computed once at startup and used by
// docValueReader.size() for memory accounting.
var reflectStaticSizedocValueReader int

func init() {
	var dvi docValueReader
	reflectStaticSizedocValueReader = int(reflect.TypeOf(dvi).Size())
}
// docNumTermsVisitor is the callback invoked once per document by
// iterateAllDocValues, receiving the doc number and that document's
// still-encoded (term-separator-delimited) doc-value bytes.
type docNumTermsVisitor func(docNum uint64, terms []byte) error

// docVisitState carries per-field docValueReaders across successive
// VisitDocValues calls so loaded/decompressed chunk data can be reused;
// segment identifies which SegmentBase the cached readers belong to.
type docVisitState struct {
	dvrs    map[uint16]*docValueReader
	segment *SegmentBase
}

// docValueReader decodes the persisted doc values of a single field,
// one chunk at a time, caching the most recently loaded chunk.
type docValueReader struct {
	field        string
	curChunkNum  uint64   // chunk currently loaded; math.MaxUint64 => none
	chunkOffsets []uint64 // per-chunk end offsets relative to dvDataLoc
	dvDataLoc    uint64   // start of this field's doc-value data in s.mem
	curChunkHeader []MetaData
	curChunkData   []byte // compressed data cache
	uncompressed   []byte // temp buf for snappy decompression
}
  45. func (di *docValueReader) size() int {
  46. return reflectStaticSizedocValueReader + SizeOfPtr +
  47. len(di.field) +
  48. len(di.chunkOffsets)*SizeOfUint64 +
  49. len(di.curChunkHeader)*reflectStaticSizeMetaData +
  50. len(di.curChunkData)
  51. }
  52. func (di *docValueReader) cloneInto(rv *docValueReader) *docValueReader {
  53. if rv == nil {
  54. rv = &docValueReader{}
  55. }
  56. rv.field = di.field
  57. rv.curChunkNum = math.MaxUint64
  58. rv.chunkOffsets = di.chunkOffsets // immutable, so it's sharable
  59. rv.dvDataLoc = di.dvDataLoc
  60. rv.curChunkHeader = rv.curChunkHeader[:0]
  61. rv.curChunkData = nil
  62. rv.uncompressed = rv.uncompressed[:0]
  63. return rv
  64. }
// curChunkNumber returns the number of the currently loaded chunk, or
// math.MaxUint64 when no chunk has been loaded yet.
func (di *docValueReader) curChunkNumber() uint64 {
	return di.curChunkNum
}
// loadFieldDocValueReader builds a docValueReader for field from the
// doc-value section occupying s.mem[fieldDvLocStart:fieldDvLocEnd].
// Returns (nil, nil) when the field has no persisted doc values.
//
// Section layout, read from the end: the final 8 bytes are the big-endian
// chunk count, the preceding 8 bytes are the byte length of the
// varint-encoded chunk-offset list, which sits immediately before them;
// the chunk data itself starts at fieldDvLocStart.
func (s *SegmentBase) loadFieldDocValueReader(field string,
	fieldDvLocStart, fieldDvLocEnd uint64) (*docValueReader, error) {
	// get the docValue offset for the given fields
	if fieldDvLocStart == fieldNotUninverted {
		// no docValues found, nothing to do
		return nil, nil
	}

	// read the number of chunks, and chunk offsets position
	var numChunks, chunkOffsetsPosition uint64

	// need more than the 16 bytes of trailing fixed-width metadata
	if fieldDvLocEnd-fieldDvLocStart > 16 {
		numChunks = binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-8 : fieldDvLocEnd])
		// read the length of chunk offsets
		chunkOffsetsLen := binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-16 : fieldDvLocEnd-8])
		// acquire position of chunk offsets
		chunkOffsetsPosition = (fieldDvLocEnd - 16) - chunkOffsetsLen
	} else {
		return nil, fmt.Errorf("loadFieldDocValueReader: fieldDvLoc too small: %d-%d", fieldDvLocEnd, fieldDvLocStart)
	}

	fdvIter := &docValueReader{
		curChunkNum:  math.MaxUint64, // sentinel: no chunk loaded yet
		field:        field,
		chunkOffsets: make([]uint64, int(numChunks)),
	}

	// read the chunk offsets (uvarint-encoded, back to back)
	var offset uint64
	for i := 0; i < int(numChunks); i++ {
		loc, read := binary.Uvarint(s.mem[chunkOffsetsPosition+offset : chunkOffsetsPosition+offset+binary.MaxVarintLen64])
		if read <= 0 {
			return nil, fmt.Errorf("corrupted chunk offset during segment load")
		}
		fdvIter.chunkOffsets[i] = loc
		offset += uint64(read)
	}

	// set the data offset
	fdvIter.dvDataLoc = fieldDvLocStart

	return fdvIter, nil
}
// loadDvChunk positions the reader on the given chunk, parsing its
// per-doc metadata and caching a view of its (still compressed) data.
//
// Chunk layout within s.mem, starting at dvDataLoc+start: a uvarint doc
// count, then one (DocNum, DocDvOffset) uvarint pair per doc, then the
// snappy-compressed doc-value bytes running to dvDataLoc+end. start/end
// come from readChunkBoundary over di.chunkOffsets.
func (di *docValueReader) loadDvChunk(chunkNumber uint64, s *SegmentBase) error {
	// advance to the chunk where the docValues
	// reside for the given docNum
	destChunkDataLoc, curChunkEnd := di.dvDataLoc, di.dvDataLoc
	start, end := readChunkBoundary(int(chunkNumber), di.chunkOffsets)
	if start >= end {
		// empty chunk: mark it current with no header/data so callers
		// see "loaded but nothing to visit"
		di.curChunkHeader = di.curChunkHeader[:0]
		di.curChunkData = nil
		di.curChunkNum = chunkNumber
		di.uncompressed = di.uncompressed[:0]
		return nil
	}

	destChunkDataLoc += start
	curChunkEnd += end

	// read the number of docs reside in the chunk
	numDocs, read := binary.Uvarint(s.mem[destChunkDataLoc : destChunkDataLoc+binary.MaxVarintLen64])
	if read <= 0 {
		return fmt.Errorf("failed to read the chunk")
	}
	chunkMetaLoc := destChunkDataLoc + uint64(read)

	offset := uint64(0)
	// reuse the header slice's capacity when possible
	if cap(di.curChunkHeader) < int(numDocs) {
		di.curChunkHeader = make([]MetaData, int(numDocs))
	} else {
		di.curChunkHeader = di.curChunkHeader[:int(numDocs)]
	}
	for i := 0; i < int(numDocs); i++ {
		di.curChunkHeader[i].DocNum, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
		offset += uint64(read)
		di.curChunkHeader[i].DocDvOffset, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
		offset += uint64(read)
	}

	compressedDataLoc := chunkMetaLoc + offset
	dataLength := curChunkEnd - compressedDataLoc
	// zero-copy view into the mmapped/loaded segment; decompressed lazily
	di.curChunkData = s.mem[compressedDataLoc : compressedDataLoc+dataLength]
	di.curChunkNum = chunkNumber
	di.uncompressed = di.uncompressed[:0]
	return nil
}
// iterateAllDocValues walks every chunk of this field in order,
// decompresses each, and invokes visitor once per document with that
// document's encoded doc-value terms. Iteration stops at the first error
// from loading, decompression, or the visitor.
func (di *docValueReader) iterateAllDocValues(s *SegmentBase, visitor docNumTermsVisitor) error {
	for i := 0; i < len(di.chunkOffsets); i++ {
		err := di.loadDvChunk(uint64(i), s)
		if err != nil {
			return err
		}
		// skip empty chunks (loadDvChunk leaves data nil / header empty)
		if di.curChunkData == nil || len(di.curChunkHeader) == 0 {
			continue
		}

		// uncompress the already loaded data, reusing our scratch buffer
		uncompressed, err := snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData)
		if err != nil {
			return err
		}
		di.uncompressed = uncompressed

		// each header entry's DocDvOffset is the exclusive end of that
		// doc's bytes; consecutive entries delimit each doc's slice
		start := uint64(0)
		for _, entry := range di.curChunkHeader {
			err = visitor(entry.DocNum, uncompressed[start:entry.DocDvOffset])
			if err != nil {
				return err
			}
			start = entry.DocDvOffset
		}
	}
	return nil
}
// visitDocValues invokes visitor for each term stored for docNum within
// the currently loaded chunk. The chunk's data is decompressed on first
// use and cached in di.uncompressed for subsequent calls. A docNum absent
// from the chunk (or with an empty range) is silently a no-op.
func (di *docValueReader) visitDocValues(docNum uint64,
	visitor index.DocValueVisitor) error {
	// binary search the term locations for the docNum
	start, end := di.getDocValueLocs(docNum)
	if start == math.MaxUint64 || end == math.MaxUint64 || start == end {
		return nil
	}

	var uncompressed []byte
	var err error
	// use the uncompressed copy if available
	if len(di.uncompressed) > 0 {
		uncompressed = di.uncompressed
	} else {
		// uncompress the already loaded data
		uncompressed, err = snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData)
		if err != nil {
			return err
		}
		di.uncompressed = uncompressed
	}

	// pick the terms for the given docNum
	uncompressed = uncompressed[start:end]
	// terms are separated by termSeparatorSplitSlice; emit each in turn
	for {
		i := bytes.Index(uncompressed, termSeparatorSplitSlice)
		if i < 0 {
			break
		}
		visitor(di.field, uncompressed[0:i])
		uncompressed = uncompressed[i+1:]
	}

	return nil
}
  202. func (di *docValueReader) getDocValueLocs(docNum uint64) (uint64, uint64) {
  203. i := sort.Search(len(di.curChunkHeader), func(i int) bool {
  204. return di.curChunkHeader[i].DocNum >= docNum
  205. })
  206. if i < len(di.curChunkHeader) && di.curChunkHeader[i].DocNum == docNum {
  207. return ReadDocValueBoundary(i, di.curChunkHeader)
  208. }
  209. return math.MaxUint64, math.MaxUint64
  210. }
// VisitDocValues is an implementation of the
// DocValueVisitable interface.
//
// It invokes visitor with each stored doc-value term of localDocNum for
// every requested field that has persisted doc values. The returned
// DocVisitState should be passed back on subsequent calls so per-field
// readers (and their loaded chunks) are reused; a state built against a
// different segment is reset.
func (s *SegmentBase) VisitDocValues(localDocNum uint64, fields []string,
	visitor index.DocValueVisitor, dvsIn segment.DocVisitState) (
	segment.DocVisitState, error) {
	dvs, ok := dvsIn.(*docVisitState)
	if !ok || dvs == nil {
		dvs = &docVisitState{}
	} else {
		// cached readers are only valid for the segment they were
		// cloned from; drop them on a segment change
		if dvs.segment != s {
			dvs.segment = s
			dvs.dvrs = nil
		}
	}

	var fieldIDPlus1 uint16
	if dvs.dvrs == nil {
		// first use (or post-reset): clone a reader per requested field
		// that actually has doc values in this segment
		dvs.dvrs = make(map[uint16]*docValueReader, len(fields))
		for _, field := range fields {
			if fieldIDPlus1, ok = s.fieldsMap[field]; !ok {
				continue
			}
			// fieldsMap stores id+1 so the zero value means "absent"
			fieldID := fieldIDPlus1 - 1
			if dvIter, exists := s.fieldDvReaders[fieldID]; exists &&
				dvIter != nil {
				dvs.dvrs[fieldID] = dvIter.cloneInto(dvs.dvrs[fieldID])
			}
		}
	}

	// find the chunkNumber where the docValues are stored
	// NOTE: doc values continue to use legacy chunk mode
	chunkFactor, err := getChunkSize(LegacyChunkMode, 0, 0)
	if err != nil {
		return nil, err
	}
	docInChunk := localDocNum / chunkFactor
	var dvr *docValueReader
	for _, field := range fields {
		if fieldIDPlus1, ok = s.fieldsMap[field]; !ok {
			continue
		}
		fieldID := fieldIDPlus1 - 1
		if dvr, ok = dvs.dvrs[fieldID]; ok && dvr != nil {
			// check if the chunk is already loaded
			if docInChunk != dvr.curChunkNumber() {
				err := dvr.loadDvChunk(docInChunk, s)
				if err != nil {
					return dvs, err
				}
			}

			// best-effort per field; a doc absent from this field's
			// chunk is simply skipped
			_ = dvr.visitDocValues(localDocNum, visitor)
		}
	}
	return dvs, nil
}
// VisitableDocValueFields returns the list of fields with
// persisted doc value terms ready to be visitable using the
// VisitDocumentFieldTerms method. The list is precomputed at segment
// load time (s.fieldDvNames), so this never errors.
func (s *SegmentBase) VisitableDocValueFields() ([]string, error) {
	return s.fieldDvNames, nil
}