You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

docvalues.go 8.7KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312
  1. // Copyright (c) 2017 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package zap
  15. import (
  16. "bytes"
  17. "encoding/binary"
  18. "fmt"
  19. "math"
  20. "reflect"
  21. "sort"
  22. "github.com/blevesearch/bleve/index"
  23. "github.com/blevesearch/bleve/index/scorch/segment"
  24. "github.com/blevesearch/bleve/size"
  25. "github.com/golang/snappy"
  26. )
  27. var reflectStaticSizedocValueReader int
  28. func init() {
  29. var dvi docValueReader
  30. reflectStaticSizedocValueReader = int(reflect.TypeOf(dvi).Size())
  31. }
  32. type docNumTermsVisitor func(docNum uint64, terms []byte) error
  33. type docVisitState struct {
  34. dvrs map[uint16]*docValueReader
  35. segment *SegmentBase
  36. }
  37. type docValueReader struct {
  38. field string
  39. curChunkNum uint64
  40. chunkOffsets []uint64
  41. dvDataLoc uint64
  42. curChunkHeader []MetaData
  43. curChunkData []byte // compressed data cache
  44. uncompressed []byte // temp buf for snappy decompression
  45. }
  46. func (di *docValueReader) size() int {
  47. return reflectStaticSizedocValueReader + size.SizeOfPtr +
  48. len(di.field) +
  49. len(di.chunkOffsets)*size.SizeOfUint64 +
  50. len(di.curChunkHeader)*reflectStaticSizeMetaData +
  51. len(di.curChunkData)
  52. }
  53. func (di *docValueReader) cloneInto(rv *docValueReader) *docValueReader {
  54. if rv == nil {
  55. rv = &docValueReader{}
  56. }
  57. rv.field = di.field
  58. rv.curChunkNum = math.MaxUint64
  59. rv.chunkOffsets = di.chunkOffsets // immutable, so it's sharable
  60. rv.dvDataLoc = di.dvDataLoc
  61. rv.curChunkHeader = rv.curChunkHeader[:0]
  62. rv.curChunkData = nil
  63. rv.uncompressed = rv.uncompressed[:0]
  64. return rv
  65. }
  66. func (di *docValueReader) curChunkNumber() uint64 {
  67. return di.curChunkNum
  68. }
  69. func (s *SegmentBase) loadFieldDocValueReader(field string,
  70. fieldDvLocStart, fieldDvLocEnd uint64) (*docValueReader, error) {
  71. // get the docValue offset for the given fields
  72. if fieldDvLocStart == fieldNotUninverted {
  73. // no docValues found, nothing to do
  74. return nil, nil
  75. }
  76. // read the number of chunks, and chunk offsets position
  77. var numChunks, chunkOffsetsPosition uint64
  78. if fieldDvLocEnd-fieldDvLocStart > 16 {
  79. numChunks = binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-8 : fieldDvLocEnd])
  80. // read the length of chunk offsets
  81. chunkOffsetsLen := binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-16 : fieldDvLocEnd-8])
  82. // acquire position of chunk offsets
  83. chunkOffsetsPosition = (fieldDvLocEnd - 16) - chunkOffsetsLen
  84. } else {
  85. return nil, fmt.Errorf("loadFieldDocValueReader: fieldDvLoc too small: %d-%d", fieldDvLocEnd, fieldDvLocStart)
  86. }
  87. fdvIter := &docValueReader{
  88. curChunkNum: math.MaxUint64,
  89. field: field,
  90. chunkOffsets: make([]uint64, int(numChunks)),
  91. }
  92. // read the chunk offsets
  93. var offset uint64
  94. for i := 0; i < int(numChunks); i++ {
  95. loc, read := binary.Uvarint(s.mem[chunkOffsetsPosition+offset : chunkOffsetsPosition+offset+binary.MaxVarintLen64])
  96. if read <= 0 {
  97. return nil, fmt.Errorf("corrupted chunk offset during segment load")
  98. }
  99. fdvIter.chunkOffsets[i] = loc
  100. offset += uint64(read)
  101. }
  102. // set the data offset
  103. fdvIter.dvDataLoc = fieldDvLocStart
  104. return fdvIter, nil
  105. }
  106. func (di *docValueReader) loadDvChunk(chunkNumber uint64, s *SegmentBase) error {
  107. // advance to the chunk where the docValues
  108. // reside for the given docNum
  109. destChunkDataLoc, curChunkEnd := di.dvDataLoc, di.dvDataLoc
  110. start, end := readChunkBoundary(int(chunkNumber), di.chunkOffsets)
  111. if start >= end {
  112. di.curChunkHeader = di.curChunkHeader[:0]
  113. di.curChunkData = nil
  114. di.curChunkNum = chunkNumber
  115. di.uncompressed = di.uncompressed[:0]
  116. return nil
  117. }
  118. destChunkDataLoc += start
  119. curChunkEnd += end
  120. // read the number of docs reside in the chunk
  121. numDocs, read := binary.Uvarint(s.mem[destChunkDataLoc : destChunkDataLoc+binary.MaxVarintLen64])
  122. if read <= 0 {
  123. return fmt.Errorf("failed to read the chunk")
  124. }
  125. chunkMetaLoc := destChunkDataLoc + uint64(read)
  126. offset := uint64(0)
  127. if cap(di.curChunkHeader) < int(numDocs) {
  128. di.curChunkHeader = make([]MetaData, int(numDocs))
  129. } else {
  130. di.curChunkHeader = di.curChunkHeader[:int(numDocs)]
  131. }
  132. for i := 0; i < int(numDocs); i++ {
  133. di.curChunkHeader[i].DocNum, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
  134. offset += uint64(read)
  135. di.curChunkHeader[i].DocDvOffset, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
  136. offset += uint64(read)
  137. }
  138. compressedDataLoc := chunkMetaLoc + offset
  139. dataLength := curChunkEnd - compressedDataLoc
  140. di.curChunkData = s.mem[compressedDataLoc : compressedDataLoc+dataLength]
  141. di.curChunkNum = chunkNumber
  142. di.uncompressed = di.uncompressed[:0]
  143. return nil
  144. }
  145. func (di *docValueReader) iterateAllDocValues(s *SegmentBase, visitor docNumTermsVisitor) error {
  146. for i := 0; i < len(di.chunkOffsets); i++ {
  147. err := di.loadDvChunk(uint64(i), s)
  148. if err != nil {
  149. return err
  150. }
  151. if di.curChunkData == nil || len(di.curChunkHeader) == 0 {
  152. continue
  153. }
  154. // uncompress the already loaded data
  155. uncompressed, err := snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData)
  156. if err != nil {
  157. return err
  158. }
  159. di.uncompressed = uncompressed
  160. start := uint64(0)
  161. for _, entry := range di.curChunkHeader {
  162. err = visitor(entry.DocNum, uncompressed[start:entry.DocDvOffset])
  163. if err != nil {
  164. return err
  165. }
  166. start = entry.DocDvOffset
  167. }
  168. }
  169. return nil
  170. }
  171. func (di *docValueReader) visitDocValues(docNum uint64,
  172. visitor index.DocumentFieldTermVisitor) error {
  173. // binary search the term locations for the docNum
  174. start, end := di.getDocValueLocs(docNum)
  175. if start == math.MaxUint64 || end == math.MaxUint64 || start == end {
  176. return nil
  177. }
  178. var uncompressed []byte
  179. var err error
  180. // use the uncompressed copy if available
  181. if len(di.uncompressed) > 0 {
  182. uncompressed = di.uncompressed
  183. } else {
  184. // uncompress the already loaded data
  185. uncompressed, err = snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData)
  186. if err != nil {
  187. return err
  188. }
  189. di.uncompressed = uncompressed
  190. }
  191. // pick the terms for the given docNum
  192. uncompressed = uncompressed[start:end]
  193. for {
  194. i := bytes.Index(uncompressed, termSeparatorSplitSlice)
  195. if i < 0 {
  196. break
  197. }
  198. visitor(di.field, uncompressed[0:i])
  199. uncompressed = uncompressed[i+1:]
  200. }
  201. return nil
  202. }
  203. func (di *docValueReader) getDocValueLocs(docNum uint64) (uint64, uint64) {
  204. i := sort.Search(len(di.curChunkHeader), func(i int) bool {
  205. return di.curChunkHeader[i].DocNum >= docNum
  206. })
  207. if i < len(di.curChunkHeader) && di.curChunkHeader[i].DocNum == docNum {
  208. return ReadDocValueBoundary(i, di.curChunkHeader)
  209. }
  210. return math.MaxUint64, math.MaxUint64
  211. }
  212. // VisitDocumentFieldTerms is an implementation of the
  213. // DocumentFieldTermVisitable interface
  214. func (s *SegmentBase) VisitDocumentFieldTerms(localDocNum uint64, fields []string,
  215. visitor index.DocumentFieldTermVisitor, dvsIn segment.DocVisitState) (
  216. segment.DocVisitState, error) {
  217. dvs, ok := dvsIn.(*docVisitState)
  218. if !ok || dvs == nil {
  219. dvs = &docVisitState{}
  220. } else {
  221. if dvs.segment != s {
  222. dvs.segment = s
  223. dvs.dvrs = nil
  224. }
  225. }
  226. var fieldIDPlus1 uint16
  227. if dvs.dvrs == nil {
  228. dvs.dvrs = make(map[uint16]*docValueReader, len(fields))
  229. for _, field := range fields {
  230. if fieldIDPlus1, ok = s.fieldsMap[field]; !ok {
  231. continue
  232. }
  233. fieldID := fieldIDPlus1 - 1
  234. if dvIter, exists := s.fieldDvReaders[fieldID]; exists &&
  235. dvIter != nil {
  236. dvs.dvrs[fieldID] = dvIter.cloneInto(dvs.dvrs[fieldID])
  237. }
  238. }
  239. }
  240. // find the chunkNumber where the docValues are stored
  241. // NOTE: doc values continue to use legacy chunk mode
  242. chunkFactor, err := getChunkSize(LegacyChunkMode, 0, 0)
  243. if err != nil {
  244. return nil, err
  245. }
  246. docInChunk := localDocNum / chunkFactor
  247. var dvr *docValueReader
  248. for _, field := range fields {
  249. if fieldIDPlus1, ok = s.fieldsMap[field]; !ok {
  250. continue
  251. }
  252. fieldID := fieldIDPlus1 - 1
  253. if dvr, ok = dvs.dvrs[fieldID]; ok && dvr != nil {
  254. // check if the chunk is already loaded
  255. if docInChunk != dvr.curChunkNumber() {
  256. err := dvr.loadDvChunk(docInChunk, s)
  257. if err != nil {
  258. return dvs, err
  259. }
  260. }
  261. _ = dvr.visitDocValues(localDocNum, visitor)
  262. }
  263. }
  264. return dvs, nil
  265. }
  266. // VisitableDocValueFields returns the list of fields with
  267. // persisted doc value terms ready to be visitable using the
  268. // VisitDocumentFieldTerms method.
  269. func (s *SegmentBase) VisitableDocValueFields() ([]string, error) {
  270. return s.fieldDvNames, nil
  271. }