You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459
  1. // Copyright (c) 2017 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package zap
  15. import (
  16. "bytes"
  17. "encoding/binary"
  18. "fmt"
  19. "io"
  20. "os"
  21. "sync"
  22. "github.com/RoaringBitmap/roaring"
  23. "github.com/Smerity/govarint"
  24. "github.com/blevesearch/bleve/index/scorch/segment"
  25. "github.com/couchbase/vellum"
  26. mmap "github.com/edsrzf/mmap-go"
  27. "github.com/golang/snappy"
  28. )
  29. // Open returns a zap impl of a segment
  30. func Open(path string) (segment.Segment, error) {
  31. f, err := os.Open(path)
  32. if err != nil {
  33. return nil, err
  34. }
  35. mm, err := mmap.Map(f, mmap.RDONLY, 0)
  36. if err != nil {
  37. // mmap failed, try to close the file
  38. _ = f.Close()
  39. return nil, err
  40. }
  41. rv := &Segment{
  42. SegmentBase: SegmentBase{
  43. mem: mm[0 : len(mm)-FooterSize],
  44. fieldsMap: make(map[string]uint16),
  45. fieldDvIterMap: make(map[uint16]*docValueIterator),
  46. },
  47. f: f,
  48. mm: mm,
  49. path: path,
  50. refs: 1,
  51. }
  52. err = rv.loadConfig()
  53. if err != nil {
  54. _ = rv.Close()
  55. return nil, err
  56. }
  57. err = rv.loadFields()
  58. if err != nil {
  59. _ = rv.Close()
  60. return nil, err
  61. }
  62. err = rv.loadDvIterators()
  63. if err != nil {
  64. _ = rv.Close()
  65. return nil, err
  66. }
  67. return rv, nil
  68. }
  69. // SegmentBase is a memory only, read-only implementation of the
  70. // segment.Segment interface, using zap's data representation.
  71. type SegmentBase struct {
  72. mem []byte
  73. memCRC uint32
  74. chunkFactor uint32
  75. fieldsMap map[string]uint16 // fieldName -> fieldID+1
  76. fieldsInv []string // fieldID -> fieldName
  77. numDocs uint64
  78. storedIndexOffset uint64
  79. fieldsIndexOffset uint64
  80. docValueOffset uint64
  81. dictLocs []uint64
  82. fieldDvIterMap map[uint16]*docValueIterator // naive chunk cache per field
  83. }
  84. func (sb *SegmentBase) AddRef() {}
  85. func (sb *SegmentBase) DecRef() (err error) { return nil }
  86. func (sb *SegmentBase) Close() (err error) { return nil }
  87. // Segment implements a persisted segment.Segment interface, by
  88. // embedding an mmap()'ed SegmentBase.
  89. type Segment struct {
  90. SegmentBase
  91. f *os.File
  92. mm mmap.MMap
  93. path string
  94. version uint32
  95. crc uint32
  96. m sync.Mutex // Protects the fields that follow.
  97. refs int64
  98. }
  99. func (s *Segment) SizeInBytes() uint64 {
  100. // 8 /* size of file pointer */
  101. // 4 /* size of version -> uint32 */
  102. // 4 /* size of crc -> uint32 */
  103. sizeOfUints := 16
  104. sizeInBytes := (len(s.path) + int(segment.SizeOfString)) + sizeOfUints
  105. // mutex, refs -> int64
  106. sizeInBytes += 16
  107. // do not include the mmap'ed part
  108. return uint64(sizeInBytes) + s.SegmentBase.SizeInBytes() - uint64(len(s.mem))
  109. }
  110. func (s *SegmentBase) SizeInBytes() uint64 {
  111. // 4 /* size of memCRC -> uint32 */
  112. // 4 /* size of chunkFactor -> uint32 */
  113. // 8 /* size of numDocs -> uint64 */
  114. // 8 /* size of storedIndexOffset -> uint64 */
  115. // 8 /* size of fieldsIndexOffset -> uint64 */
  116. // 8 /* size of docValueOffset -> uint64 */
  117. sizeInBytes := 40
  118. sizeInBytes += len(s.mem) + int(segment.SizeOfSlice)
  119. // fieldsMap
  120. for k, _ := range s.fieldsMap {
  121. sizeInBytes += (len(k) + int(segment.SizeOfString)) + 2 /* size of uint16 */
  122. }
  123. sizeInBytes += int(segment.SizeOfMap) /* overhead from map */
  124. // fieldsInv, dictLocs
  125. for _, entry := range s.fieldsInv {
  126. sizeInBytes += (len(entry) + int(segment.SizeOfString))
  127. }
  128. sizeInBytes += len(s.dictLocs) * 8 /* size of uint64 */
  129. sizeInBytes += int(segment.SizeOfSlice) * 3 /* overhead from slices */
  130. // fieldDvIterMap
  131. sizeInBytes += len(s.fieldDvIterMap) *
  132. int(segment.SizeOfPointer+2 /* size of uint16 */)
  133. for _, entry := range s.fieldDvIterMap {
  134. if entry != nil {
  135. sizeInBytes += int(entry.sizeInBytes())
  136. }
  137. }
  138. sizeInBytes += int(segment.SizeOfMap)
  139. return uint64(sizeInBytes)
  140. }
  141. func (s *Segment) AddRef() {
  142. s.m.Lock()
  143. s.refs++
  144. s.m.Unlock()
  145. }
  146. func (s *Segment) DecRef() (err error) {
  147. s.m.Lock()
  148. s.refs--
  149. if s.refs == 0 {
  150. err = s.closeActual()
  151. }
  152. s.m.Unlock()
  153. return err
  154. }
  155. func (s *Segment) loadConfig() error {
  156. crcOffset := len(s.mm) - 4
  157. s.crc = binary.BigEndian.Uint32(s.mm[crcOffset : crcOffset+4])
  158. verOffset := crcOffset - 4
  159. s.version = binary.BigEndian.Uint32(s.mm[verOffset : verOffset+4])
  160. if s.version != version {
  161. return fmt.Errorf("unsupported version %d", s.version)
  162. }
  163. chunkOffset := verOffset - 4
  164. s.chunkFactor = binary.BigEndian.Uint32(s.mm[chunkOffset : chunkOffset+4])
  165. docValueOffset := chunkOffset - 8
  166. s.docValueOffset = binary.BigEndian.Uint64(s.mm[docValueOffset : docValueOffset+8])
  167. fieldsIndexOffset := docValueOffset - 8
  168. s.fieldsIndexOffset = binary.BigEndian.Uint64(s.mm[fieldsIndexOffset : fieldsIndexOffset+8])
  169. storedIndexOffset := fieldsIndexOffset - 8
  170. s.storedIndexOffset = binary.BigEndian.Uint64(s.mm[storedIndexOffset : storedIndexOffset+8])
  171. numDocsOffset := storedIndexOffset - 8
  172. s.numDocs = binary.BigEndian.Uint64(s.mm[numDocsOffset : numDocsOffset+8])
  173. return nil
  174. }
  175. func (s *SegmentBase) loadFields() error {
  176. // NOTE for now we assume the fields index immediately preceeds
  177. // the footer, and if this changes, need to adjust accordingly (or
  178. // store explicit length), where s.mem was sliced from s.mm in Open().
  179. fieldsIndexEnd := uint64(len(s.mem))
  180. // iterate through fields index
  181. var fieldID uint64
  182. for s.fieldsIndexOffset+(8*fieldID) < fieldsIndexEnd {
  183. addr := binary.BigEndian.Uint64(s.mem[s.fieldsIndexOffset+(8*fieldID) : s.fieldsIndexOffset+(8*fieldID)+8])
  184. dictLoc, read := binary.Uvarint(s.mem[addr:fieldsIndexEnd])
  185. n := uint64(read)
  186. s.dictLocs = append(s.dictLocs, dictLoc)
  187. var nameLen uint64
  188. nameLen, read = binary.Uvarint(s.mem[addr+n : fieldsIndexEnd])
  189. n += uint64(read)
  190. name := string(s.mem[addr+n : addr+n+nameLen])
  191. s.fieldsInv = append(s.fieldsInv, name)
  192. s.fieldsMap[name] = uint16(fieldID + 1)
  193. fieldID++
  194. }
  195. return nil
  196. }
  197. // Dictionary returns the term dictionary for the specified field
  198. func (s *SegmentBase) Dictionary(field string) (segment.TermDictionary, error) {
  199. dict, err := s.dictionary(field)
  200. if err == nil && dict == nil {
  201. return &segment.EmptyDictionary{}, nil
  202. }
  203. return dict, err
  204. }
  205. func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) {
  206. fieldIDPlus1 := sb.fieldsMap[field]
  207. if fieldIDPlus1 > 0 {
  208. rv = &Dictionary{
  209. sb: sb,
  210. field: field,
  211. fieldID: fieldIDPlus1 - 1,
  212. }
  213. dictStart := sb.dictLocs[rv.fieldID]
  214. if dictStart > 0 {
  215. // read the length of the vellum data
  216. vellumLen, read := binary.Uvarint(sb.mem[dictStart : dictStart+binary.MaxVarintLen64])
  217. fstBytes := sb.mem[dictStart+uint64(read) : dictStart+uint64(read)+vellumLen]
  218. if fstBytes != nil {
  219. rv.fst, err = vellum.Load(fstBytes)
  220. if err != nil {
  221. return nil, fmt.Errorf("dictionary field %s vellum err: %v", field, err)
  222. }
  223. }
  224. }
  225. }
  226. return rv, nil
  227. }
  228. // VisitDocument invokes the DocFieldValueVistor for each stored field
  229. // for the specified doc number
  230. func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error {
  231. // first make sure this is a valid number in this segment
  232. if num < s.numDocs {
  233. meta, compressed := s.getDocStoredMetaAndCompressed(num)
  234. uncompressed, err := snappy.Decode(nil, compressed)
  235. if err != nil {
  236. return err
  237. }
  238. // now decode meta and process
  239. reader := bytes.NewReader(meta)
  240. decoder := govarint.NewU64Base128Decoder(reader)
  241. keepGoing := true
  242. for keepGoing {
  243. field, err := decoder.GetU64()
  244. if err == io.EOF {
  245. break
  246. }
  247. if err != nil {
  248. return err
  249. }
  250. typ, err := decoder.GetU64()
  251. if err != nil {
  252. return err
  253. }
  254. offset, err := decoder.GetU64()
  255. if err != nil {
  256. return err
  257. }
  258. l, err := decoder.GetU64()
  259. if err != nil {
  260. return err
  261. }
  262. numap, err := decoder.GetU64()
  263. if err != nil {
  264. return err
  265. }
  266. var arrayPos []uint64
  267. if numap > 0 {
  268. arrayPos = make([]uint64, numap)
  269. for i := 0; i < int(numap); i++ {
  270. ap, err := decoder.GetU64()
  271. if err != nil {
  272. return err
  273. }
  274. arrayPos[i] = ap
  275. }
  276. }
  277. value := uncompressed[offset : offset+l]
  278. keepGoing = visitor(s.fieldsInv[field], byte(typ), value, arrayPos)
  279. }
  280. }
  281. return nil
  282. }
  283. // Count returns the number of documents in this segment.
  284. func (s *SegmentBase) Count() uint64 {
  285. return s.numDocs
  286. }
  287. // DocNumbers returns a bitset corresponding to the doc numbers of all the
  288. // provided _id strings
  289. func (s *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) {
  290. rv := roaring.New()
  291. if len(s.fieldsMap) > 0 {
  292. idDict, err := s.dictionary("_id")
  293. if err != nil {
  294. return nil, err
  295. }
  296. var postings *PostingsList
  297. for _, id := range ids {
  298. postings, err = idDict.postingsList([]byte(id), nil, postings)
  299. if err != nil {
  300. return nil, err
  301. }
  302. if postings.postings != nil {
  303. rv.Or(postings.postings)
  304. }
  305. }
  306. }
  307. return rv, nil
  308. }
  309. // Fields returns the field names used in this segment
  310. func (s *SegmentBase) Fields() []string {
  311. return s.fieldsInv
  312. }
  313. // Path returns the path of this segment on disk
  314. func (s *Segment) Path() string {
  315. return s.path
  316. }
  317. // Close releases all resources associated with this segment
  318. func (s *Segment) Close() (err error) {
  319. return s.DecRef()
  320. }
  321. func (s *Segment) closeActual() (err error) {
  322. if s.mm != nil {
  323. err = s.mm.Unmap()
  324. }
  325. // try to close file even if unmap failed
  326. if s.f != nil {
  327. err2 := s.f.Close()
  328. if err == nil {
  329. // try to return first error
  330. err = err2
  331. }
  332. }
  333. return
  334. }
// Some helpers I started adding for the command-line utility.
  336. // Data returns the underlying mmaped data slice
  337. func (s *Segment) Data() []byte {
  338. return s.mm
  339. }
  340. // CRC returns the CRC value stored in the file footer
  341. func (s *Segment) CRC() uint32 {
  342. return s.crc
  343. }
  344. // Version returns the file version in the file footer
  345. func (s *Segment) Version() uint32 {
  346. return s.version
  347. }
  348. // ChunkFactor returns the chunk factor in the file footer
  349. func (s *Segment) ChunkFactor() uint32 {
  350. return s.chunkFactor
  351. }
  352. // FieldsIndexOffset returns the fields index offset in the file footer
  353. func (s *Segment) FieldsIndexOffset() uint64 {
  354. return s.fieldsIndexOffset
  355. }
  356. // StoredIndexOffset returns the stored value index offset in the file footer
  357. func (s *Segment) StoredIndexOffset() uint64 {
  358. return s.storedIndexOffset
  359. }
  360. // DocValueOffset returns the docValue offset in the file footer
  361. func (s *Segment) DocValueOffset() uint64 {
  362. return s.docValueOffset
  363. }
  364. // NumDocs returns the number of documents in the file footer
  365. func (s *Segment) NumDocs() uint64 {
  366. return s.numDocs
  367. }
  368. // DictAddr is a helper function to compute the file offset where the
  369. // dictionary is stored for the specified field.
  370. func (s *Segment) DictAddr(field string) (uint64, error) {
  371. fieldIDPlus1, ok := s.fieldsMap[field]
  372. if !ok {
  373. return 0, fmt.Errorf("no such field '%s'", field)
  374. }
  375. return s.dictLocs[fieldIDPlus1-1], nil
  376. }
  377. func (s *SegmentBase) loadDvIterators() error {
  378. if s.docValueOffset == fieldNotUninverted {
  379. return nil
  380. }
  381. var read uint64
  382. for fieldID, field := range s.fieldsInv {
  383. fieldLoc, n := binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64])
  384. if n <= 0 {
  385. return fmt.Errorf("loadDvIterators: failed to read the docvalue offsets for field %d", fieldID)
  386. }
  387. s.fieldDvIterMap[uint16(fieldID)], _ = s.loadFieldDocValueIterator(field, fieldLoc)
  388. read += uint64(n)
  389. }
  390. return nil
  391. }