
build.go

// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package zap

import (
	"bufio"
	"bytes"
	"encoding/binary"
	"math"
	"os"
	"sort"

	"github.com/Smerity/govarint"
	"github.com/blevesearch/bleve/index/scorch/segment/mem"
	"github.com/couchbase/vellum"
	"github.com/golang/snappy"
)

const version uint32 = 3

const fieldNotUninverted = math.MaxUint64
// PersistSegmentBase persists SegmentBase in the zap file format.
func PersistSegmentBase(sb *SegmentBase, path string) error {
	flag := os.O_RDWR | os.O_CREATE

	f, err := os.OpenFile(path, flag, 0600)
	if err != nil {
		return err
	}

	cleanup := func() {
		_ = f.Close()
		_ = os.Remove(path)
	}

	br := bufio.NewWriter(f)

	_, err = br.Write(sb.mem)
	if err != nil {
		cleanup()
		return err
	}

	err = persistFooter(sb.numDocs, sb.storedIndexOffset, sb.fieldsIndexOffset, sb.docValueOffset,
		sb.chunkFactor, sb.memCRC, br)
	if err != nil {
		cleanup()
		return err
	}

	err = br.Flush()
	if err != nil {
		cleanup()
		return err
	}

	err = f.Sync()
	if err != nil {
		cleanup()
		return err
	}

	err = f.Close()
	if err != nil {
		cleanup()
		return err
	}

	return nil
}
// PersistSegment takes the in-memory segment and persists it to
// the specified path in the zap file format.
func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) error {
	flag := os.O_RDWR | os.O_CREATE

	f, err := os.OpenFile(path, flag, 0600)
	if err != nil {
		return err
	}

	cleanup := func() {
		_ = f.Close()
		_ = os.Remove(path)
	}

	// buffer the output
	br := bufio.NewWriter(f)

	// wrap it for counting (tracking offsets)
	cr := NewCountHashWriter(br)

	numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, _, err :=
		persistBase(memSegment, cr, chunkFactor)
	if err != nil {
		cleanup()
		return err
	}

	err = persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset,
		chunkFactor, cr.Sum32(), cr)
	if err != nil {
		cleanup()
		return err
	}

	err = br.Flush()
	if err != nil {
		cleanup()
		return err
	}

	err = f.Sync()
	if err != nil {
		cleanup()
		return err
	}

	err = f.Close()
	if err != nil {
		cleanup()
		return err
	}

	return nil
}
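// persistBase writes the component sections of the in-memory segment
// (stored fields, posting details, postings locations, postings lists,
// per-field dictionaries, doc values and the fields section) through the
// CountHashWriter, and returns the offsets needed to build the footer.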
func persistBase(memSegment *mem.Segment, cr *CountHashWriter, chunkFactor uint32) (
	numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64,
	dictLocs []uint64, err error) {
	docValueOffset = uint64(fieldNotUninverted)

	if len(memSegment.Stored) > 0 {
		storedIndexOffset, err = persistStored(memSegment, cr)
		if err != nil {
			return 0, 0, 0, 0, nil, err
		}

		freqOffsets, locOffsets, err := persistPostingDetails(memSegment, cr, chunkFactor)
		if err != nil {
			return 0, 0, 0, 0, nil, err
		}

		postingsListLocs, err := persistPostingsLocs(memSegment, cr)
		if err != nil {
			return 0, 0, 0, 0, nil, err
		}

		postingsLocs, err := persistPostingsLists(memSegment, cr, postingsListLocs, freqOffsets, locOffsets)
		if err != nil {
			return 0, 0, 0, 0, nil, err
		}

		dictLocs, err = persistDictionary(memSegment, cr, postingsLocs)
		if err != nil {
			return 0, 0, 0, 0, nil, err
		}

		docValueOffset, err = persistFieldDocValues(memSegment, cr, chunkFactor)
		if err != nil {
			return 0, 0, 0, 0, nil, err
		}
	} else {
		dictLocs = make([]uint64, len(memSegment.FieldsInv))
	}

	fieldsIndexOffset, err = persistFields(memSegment.FieldsInv, cr, dictLocs)
	if err != nil {
		return 0, 0, 0, 0, nil, err
	}

	return uint64(len(memSegment.Stored)), storedIndexOffset, fieldsIndexOffset, docValueOffset,
		dictLocs, nil
}
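// persistStored writes each document's stored field values as a varint-encoded
// metadata section followed by the snappy-compressed value data, then writes
// the stored-doc index of per-document offsets and returns the offset at which
// that index begins.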
func persistStored(memSegment *mem.Segment, w *CountHashWriter) (uint64, error) {
	var curr int

	var metaBuf bytes.Buffer
	var data, compressed []byte

	metaEncoder := govarint.NewU64Base128Encoder(&metaBuf)

	docNumOffsets := make(map[int]uint64, len(memSegment.Stored))

	for docNum, storedValues := range memSegment.Stored {
		if docNum != 0 {
			// reset buffer if necessary
			curr = 0
			metaBuf.Reset()
			data = data[:0]
			compressed = compressed[:0]
		}

		st := memSegment.StoredTypes[docNum]
		sp := memSegment.StoredPos[docNum]

		// encode fields in order
		for fieldID := range memSegment.FieldsInv {
			if storedFieldValues, ok := storedValues[uint16(fieldID)]; ok {
				stf := st[uint16(fieldID)]
				spf := sp[uint16(fieldID)]

				var err2 error
				curr, data, err2 = persistStoredFieldValues(fieldID,
					storedFieldValues, stf, spf, curr, metaEncoder, data)
				if err2 != nil {
					return 0, err2
				}
			}
		}

		metaEncoder.Close()
		metaBytes := metaBuf.Bytes()

		// compress the data
		compressed = snappy.Encode(compressed, data)

		// record where we're about to start writing
		docNumOffsets[docNum] = uint64(w.Count())

		// write out the meta len and compressed data len
		_, err := writeUvarints(w, uint64(len(metaBytes)), uint64(len(compressed)))
		if err != nil {
			return 0, err
		}

		// now write the meta
		_, err = w.Write(metaBytes)
		if err != nil {
			return 0, err
		}

		// now write the compressed data
		_, err = w.Write(compressed)
		if err != nil {
			return 0, err
		}
	}

	// return value is the start of the stored index
	rv := uint64(w.Count())

	// now write out the stored doc index
	for docNum := range memSegment.Stored {
		err := binary.Write(w, binary.BigEndian, docNumOffsets[docNum])
		if err != nil {
			return 0, err
		}
	}

	return rv, nil
}
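// persistStoredFieldValues encodes the metadata for each stored value of a
// single field (field ID, type, start offset, length and array positions)
// through the meta encoder, appends the raw value bytes to data, and returns
// the updated offset and data slice.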
func persistStoredFieldValues(fieldID int,
	storedFieldValues [][]byte, stf []byte, spf [][]uint64,
	curr int, metaEncoder *govarint.Base128Encoder, data []byte) (
	int, []byte, error) {
	for i := 0; i < len(storedFieldValues); i++ {
		// encode field
		_, err := metaEncoder.PutU64(uint64(fieldID))
		if err != nil {
			return 0, nil, err
		}
		// encode type
		_, err = metaEncoder.PutU64(uint64(stf[i]))
		if err != nil {
			return 0, nil, err
		}
		// encode start offset
		_, err = metaEncoder.PutU64(uint64(curr))
		if err != nil {
			return 0, nil, err
		}
		// encode len
		_, err = metaEncoder.PutU64(uint64(len(storedFieldValues[i])))
		if err != nil {
			return 0, nil, err
		}
		// encode number of array pos
		_, err = metaEncoder.PutU64(uint64(len(spf[i])))
		if err != nil {
			return 0, nil, err
		}
		// encode all array positions
		for _, pos := range spf[i] {
			_, err = metaEncoder.PutU64(pos)
			if err != nil {
				return 0, nil, err
			}
		}

		data = append(data, storedFieldValues[i]...)
		curr += len(storedFieldValues[i])
	}

	return curr, data, nil
}
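// persistPostingDetails writes, for every postings list, the chunked
// freq/norm details followed (in a second pass) by the chunked location
// details, and returns the start offsets of each posting's freq and loc
// sections.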
func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFactor uint32) ([]uint64, []uint64, error) {
	var freqOffsets, locOffsets []uint64

	tfEncoder := newChunkedIntCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1))
	for postingID := range memSegment.Postings {
		if postingID != 0 {
			tfEncoder.Reset()
		}
		freqs := memSegment.Freqs[postingID]
		norms := memSegment.Norms[postingID]
		postingsListItr := memSegment.Postings[postingID].Iterator()
		var offset int
		for postingsListItr.HasNext() {
			docNum := uint64(postingsListItr.Next())

			// put freq
			err := tfEncoder.Add(docNum, freqs[offset])
			if err != nil {
				return nil, nil, err
			}

			// put norm
			norm := norms[offset]
			normBits := math.Float32bits(norm)
			err = tfEncoder.Add(docNum, uint64(normBits))
			if err != nil {
				return nil, nil, err
			}

			offset++
		}

		// record where this posting's freq info starts
		freqOffsets = append(freqOffsets, uint64(w.Count()))

		tfEncoder.Close()
		_, err := tfEncoder.Write(w)
		if err != nil {
			return nil, nil, err
		}
	}

	// now do it again for the locations
	locEncoder := newChunkedIntCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1))
	for postingID := range memSegment.Postings {
		if postingID != 0 {
			locEncoder.Reset()
		}
		freqs := memSegment.Freqs[postingID]
		locfields := memSegment.Locfields[postingID]
		locpos := memSegment.Locpos[postingID]
		locstarts := memSegment.Locstarts[postingID]
		locends := memSegment.Locends[postingID]
		locarraypos := memSegment.Locarraypos[postingID]
		postingsListItr := memSegment.Postings[postingID].Iterator()
		var offset int
		var locOffset int
		for postingsListItr.HasNext() {
			docNum := uint64(postingsListItr.Next())
			for i := 0; i < int(freqs[offset]); i++ {
				if len(locfields) > 0 {
					// put field
					err := locEncoder.Add(docNum, uint64(locfields[locOffset]))
					if err != nil {
						return nil, nil, err
					}

					// put pos
					err = locEncoder.Add(docNum, locpos[locOffset])
					if err != nil {
						return nil, nil, err
					}

					// put start
					err = locEncoder.Add(docNum, locstarts[locOffset])
					if err != nil {
						return nil, nil, err
					}

					// put end
					err = locEncoder.Add(docNum, locends[locOffset])
					if err != nil {
						return nil, nil, err
					}

					// put the number of array positions to follow
					num := len(locarraypos[locOffset])
					err = locEncoder.Add(docNum, uint64(num))
					if err != nil {
						return nil, nil, err
					}

					// put each array position
					for _, pos := range locarraypos[locOffset] {
						err = locEncoder.Add(docNum, pos)
						if err != nil {
							return nil, nil, err
						}
					}
				}
				locOffset++
			}
			offset++
		}

		// record where this posting's loc info starts
		locOffsets = append(locOffsets, uint64(w.Count()))

		locEncoder.Close()
		_, err := locEncoder.Write(w)
		if err != nil {
			return nil, nil, err
		}
	}

	return freqOffsets, locOffsets, nil
}
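// persistPostingsLocs writes out each "has locations" roaring bitmap with a
// uvarint length prefix and returns the offset at which each one starts.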
func persistPostingsLocs(memSegment *mem.Segment, w *CountHashWriter) (rv []uint64, err error) {
	rv = make([]uint64, 0, len(memSegment.PostingsLocs))
	var reuseBuf bytes.Buffer
	reuseBufVarint := make([]byte, binary.MaxVarintLen64)
	for postingID := range memSegment.PostingsLocs {
		// record where we start this posting loc
		rv = append(rv, uint64(w.Count()))
		// write out the length and bitmap
		_, err = writeRoaringWithLen(memSegment.PostingsLocs[postingID], w, &reuseBuf, reuseBufVarint)
		if err != nil {
			return nil, err
		}
	}
	return rv, nil
}
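// persistPostingsLists writes each postings list as the uvarint offsets of
// its freq/norm details, loc details and loc bitmap, followed by the
// length-prefixed roaring bitmap itself, and returns the offset at which
// each postings list starts.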
func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter,
	postingsListLocs, freqOffsets, locOffsets []uint64) (rv []uint64, err error) {
	rv = make([]uint64, 0, len(memSegment.Postings))
	var reuseBuf bytes.Buffer
	reuseBufVarint := make([]byte, binary.MaxVarintLen64)
	for postingID := range memSegment.Postings {
		// record where we start this posting list
		rv = append(rv, uint64(w.Count()))

		// write out the term info, loc info, and loc posting list offset
		_, err = writeUvarints(w, freqOffsets[postingID],
			locOffsets[postingID], postingsListLocs[postingID])
		if err != nil {
			return nil, err
		}

		// write out the length and bitmap
		_, err = writeRoaringWithLen(memSegment.Postings[postingID], w, &reuseBuf, reuseBufVarint)
		if err != nil {
			return nil, err
		}
	}
	return rv, nil
}
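// persistDictionary builds a vellum FST per field that maps each term to the
// offset of its postings list, writes each FST with a uvarint length prefix,
// and returns the offsets at which the per-field dictionaries start.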
func persistDictionary(memSegment *mem.Segment, w *CountHashWriter, postingsLocs []uint64) ([]uint64, error) {
	rv := make([]uint64, 0, len(memSegment.DictKeys))

	varintBuf := make([]byte, binary.MaxVarintLen64)

	var buffer bytes.Buffer
	for fieldID, fieldTerms := range memSegment.DictKeys {
		if fieldID != 0 {
			buffer.Reset()
		}

		// start a new vellum for this field
		builder, err := vellum.New(&buffer, nil)
		if err != nil {
			return nil, err
		}

		dict := memSegment.Dicts[fieldID]
		// now walk the dictionary in order of fieldTerms (already sorted)
		for _, fieldTerm := range fieldTerms {
			postingID := dict[fieldTerm] - 1
			postingsAddr := postingsLocs[postingID]
			err = builder.Insert([]byte(fieldTerm), postingsAddr)
			if err != nil {
				return nil, err
			}
		}
		err = builder.Close()
		if err != nil {
			return nil, err
		}

		// record where this dictionary starts
		rv = append(rv, uint64(w.Count()))

		vellumData := buffer.Bytes()

		// write out the length of the vellum data
		n := binary.PutUvarint(varintBuf, uint64(len(vellumData)))
		_, err = w.Write(varintBuf[:n])
		if err != nil {
			return nil, err
		}

		// write this vellum to disk
		_, err = w.Write(vellumData)
		if err != nil {
			return nil, err
		}
	}

	return rv, nil
}
type docIDRange []uint64

func (a docIDRange) Len() int           { return len(a) }
func (a docIDRange) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
func (a docIDRange) Less(i, j int) bool { return a[i] < a[j] }
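// persistDocValues walks the dictionary of every doc-value field, collects
// the terms seen for each document, and writes the chunked per-document
// content for that field, returning the start offset of each field's doc
// value data.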
func persistDocValues(memSegment *mem.Segment, w *CountHashWriter,
	chunkFactor uint32) (map[uint16]uint64, error) {
	fieldChunkOffsets := make(map[uint16]uint64, len(memSegment.FieldsInv))
	fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1))

	for fieldID := range memSegment.DocValueFields {
		field := memSegment.FieldsInv[fieldID]
		docTermMap := make(map[uint64][]byte, 0)
		dict, err := memSegment.Dictionary(field)
		if err != nil {
			return nil, err
		}

		dictItr := dict.Iterator()
		next, err := dictItr.Next()
		for err == nil && next != nil {
			postings, err1 := dict.PostingsList(next.Term, nil)
			if err1 != nil {
				return nil, err1
			}

			postingsItr := postings.Iterator()
			nextPosting, err2 := postingsItr.Next()
			for err2 == nil && nextPosting != nil {
				docNum := nextPosting.Number()
				docTermMap[docNum] = append(docTermMap[docNum], []byte(next.Term)...)
				docTermMap[docNum] = append(docTermMap[docNum], termSeparator)
				nextPosting, err2 = postingsItr.Next()
			}
			if err2 != nil {
				return nil, err2
			}

			next, err = dictItr.Next()
		}
		if err != nil {
			return nil, err
		}

		// sort with respect to docIDs
		var docNumbers docIDRange
		for k := range docTermMap {
			docNumbers = append(docNumbers, k)
		}
		sort.Sort(docNumbers)

		for _, docNum := range docNumbers {
			err = fdvEncoder.Add(docNum, docTermMap[docNum])
			if err != nil {
				return nil, err
			}
		}

		fieldChunkOffsets[fieldID] = uint64(w.Count())
		err = fdvEncoder.Close()
		if err != nil {
			return nil, err
		}
		// persist the doc value details for this field
		_, err = fdvEncoder.Write(w)
		if err != nil {
			return nil, err
		}
		// resetting encoder for the next field
		fdvEncoder.Reset()
	}

	return fieldChunkOffsets, nil
}
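// persistFieldDocValues writes the doc value data for every field via
// persistDocValues, then writes a per-field index of doc value offsets
// (marking fields without doc values as not uninverted) and returns the
// offset of that index.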
func persistFieldDocValues(memSegment *mem.Segment, w *CountHashWriter,
	chunkFactor uint32) (uint64, error) {
	fieldDvOffsets, err := persistDocValues(memSegment, w, chunkFactor)
	if err != nil {
		return 0, err
	}

	fieldDocValuesOffset := uint64(w.Count())
	buf := make([]byte, binary.MaxVarintLen64)
	offset := uint64(0)
	ok := true
	for fieldID := range memSegment.FieldsInv {
		// if the field isn't configured for docValue, then mark
		// the offset accordingly
		if offset, ok = fieldDvOffsets[uint16(fieldID)]; !ok {
			offset = fieldNotUninverted
		}
		n := binary.PutUvarint(buf, uint64(offset))
		_, err := w.Write(buf[:n])
		if err != nil {
			return 0, err
		}
	}

	return fieldDocValuesOffset, nil
}
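// NewSegmentBase persists the in-memory segment into a byte buffer and wraps
// the result in a SegmentBase.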
func NewSegmentBase(memSegment *mem.Segment, chunkFactor uint32) (*SegmentBase, error) {
	var br bytes.Buffer

	cr := NewCountHashWriter(&br)

	numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs, err :=
		persistBase(memSegment, cr, chunkFactor)
	if err != nil {
		return nil, err
	}

	return InitSegmentBase(br.Bytes(), cr.Sum32(), chunkFactor,
		memSegment.FieldsMap, memSegment.FieldsInv, numDocs,
		storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs)
}
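// InitSegmentBase constructs a SegmentBase from the given backing bytes and
// offsets, and loads the per-field doc value iterators.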
func InitSegmentBase(mem []byte, memCRC uint32, chunkFactor uint32,
	fieldsMap map[string]uint16, fieldsInv []string, numDocs uint64,
	storedIndexOffset uint64, fieldsIndexOffset uint64, docValueOffset uint64,
	dictLocs []uint64) (*SegmentBase, error) {
	sb := &SegmentBase{
		mem:               mem,
		memCRC:            memCRC,
		chunkFactor:       chunkFactor,
		fieldsMap:         fieldsMap,
		fieldsInv:         fieldsInv,
		numDocs:           numDocs,
		storedIndexOffset: storedIndexOffset,
		fieldsIndexOffset: fieldsIndexOffset,
		docValueOffset:    docValueOffset,
		dictLocs:          dictLocs,
		fieldDvIterMap:    make(map[uint16]*docValueIterator),
	}

	err := sb.loadDvIterators()
	if err != nil {
		return nil, err
	}

	return sb, nil
}