You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

posting.go 24KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910
  1. // Copyright (c) 2017 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package zap
  15. import (
  16. "encoding/binary"
  17. "fmt"
  18. "math"
  19. "reflect"
  20. "github.com/RoaringBitmap/roaring"
  21. "github.com/blevesearch/bleve/index/scorch/segment"
  22. "github.com/blevesearch/bleve/size"
  23. )
  24. var reflectStaticSizePostingsList int
  25. var reflectStaticSizePostingsIterator int
  26. var reflectStaticSizePosting int
  27. var reflectStaticSizeLocation int
  28. func init() {
  29. var pl PostingsList
  30. reflectStaticSizePostingsList = int(reflect.TypeOf(pl).Size())
  31. var pi PostingsIterator
  32. reflectStaticSizePostingsIterator = int(reflect.TypeOf(pi).Size())
  33. var p Posting
  34. reflectStaticSizePosting = int(reflect.TypeOf(p).Size())
  35. var l Location
  36. reflectStaticSizeLocation = int(reflect.TypeOf(l).Size())
  37. }
  38. // FST or vellum value (uint64) encoding is determined by the top two
  39. // highest-order or most significant bits...
  40. //
  41. // encoding : MSB
  42. // name : 63 62 61...to...bit #0 (LSB)
  43. // ----------+---+---+---------------------------------------------------
  44. // general : 0 | 0 | 62-bits of postingsOffset.
  45. // ~ : 0 | 1 | reserved for future.
  46. // 1-hit : 1 | 0 | 31-bits of positive float31 norm | 31-bits docNum.
  47. // ~ : 1 | 1 | reserved for future.
  48. //
  49. // Encoding "general" is able to handle all cases, where the
  50. // postingsOffset points to more information about the postings for
  51. // the term.
  52. //
  53. // Encoding "1-hit" is used to optimize a commonly seen case when a
  54. // term has only a single hit. For example, a term in the _id field
  55. // will have only 1 hit. The "1-hit" encoding is used for a term
  56. // in a field when...
  57. //
  58. // - term vector info is disabled for that field;
  59. // - and, the term appears in only a single doc for that field;
  60. // - and, the term's freq is exactly 1 in that single doc for that field;
  61. // - and, the docNum must fit into 31-bits;
  62. //
  63. // Otherwise, the "general" encoding is used instead.
  64. //
  65. // In the "1-hit" encoding, the field in that single doc may have
  66. // other terms, which is supported in the "1-hit" encoding by the
  67. // positive float31 norm.
  68. const FSTValEncodingMask = uint64(0xc000000000000000)
  69. const FSTValEncodingGeneral = uint64(0x0000000000000000)
  70. const FSTValEncoding1Hit = uint64(0x8000000000000000)
  71. func FSTValEncode1Hit(docNum uint64, normBits uint64) uint64 {
  72. return FSTValEncoding1Hit | ((mask31Bits & normBits) << 31) | (mask31Bits & docNum)
  73. }
  74. func FSTValDecode1Hit(v uint64) (docNum uint64, normBits uint64) {
  75. return (mask31Bits & v), (mask31Bits & (v >> 31))
  76. }
  77. const mask31Bits = uint64(0x000000007fffffff)
  78. func under32Bits(x uint64) bool {
  79. return x <= mask31Bits
  80. }
  81. const DocNum1HitFinished = math.MaxUint64
  82. var NormBits1Hit = uint64(math.Float32bits(float32(1)))
// PostingsList is an in-memory representation of a postings list
type PostingsList struct {
	// sb is the segment whose mem buffer holds the encoded freq/norm and
	// location data referenced by the offsets below.
	sb *SegmentBase
	// postingsOffset is the value handed to read(): either an offset into
	// the segment's mem or a "1-hit" encoded value.
	postingsOffset uint64
	// freqOffset and locOffset are decoded by read() and used by the
	// iterator as the positions of the freq/norm and location chunk tables.
	freqOffset uint64
	locOffset  uint64
	// postings is the bitmap of docNums loaded by read().
	postings *roaring.Bitmap
	// except, when non-nil, holds docNums that are removed from postings
	// when iterating and counting.
	except *roaring.Bitmap

	// when normBits1Hit != 0, then this postings list came from a
	// 1-hit encoding, and only the docNum1Hit & normBits1Hit apply
	docNum1Hit   uint64
	normBits1Hit uint64
}
  96. // represents an immutable, empty postings list
  97. var emptyPostingsList = &PostingsList{}
  98. func (p *PostingsList) Size() int {
  99. sizeInBytes := reflectStaticSizePostingsList + size.SizeOfPtr
  100. if p.except != nil {
  101. sizeInBytes += int(p.except.GetSizeInBytes())
  102. }
  103. return sizeInBytes
  104. }
  105. func (p *PostingsList) OrInto(receiver *roaring.Bitmap) {
  106. if p.normBits1Hit != 0 {
  107. receiver.Add(uint32(p.docNum1Hit))
  108. return
  109. }
  110. if p.postings != nil {
  111. receiver.Or(p.postings)
  112. }
  113. }
  114. // Iterator returns an iterator for this postings list
  115. func (p *PostingsList) Iterator(includeFreq, includeNorm, includeLocs bool,
  116. prealloc segment.PostingsIterator) segment.PostingsIterator {
  117. if p.normBits1Hit == 0 && p.postings == nil {
  118. return emptyPostingsIterator
  119. }
  120. var preallocPI *PostingsIterator
  121. pi, ok := prealloc.(*PostingsIterator)
  122. if ok && pi != nil {
  123. preallocPI = pi
  124. }
  125. if preallocPI == emptyPostingsIterator {
  126. preallocPI = nil
  127. }
  128. return p.iterator(includeFreq, includeNorm, includeLocs, preallocPI)
  129. }
// iterator prepares rv (or a newly allocated PostingsIterator when rv is
// nil) to iterate over p.
//
// When rv is supplied, its readers, offset slices, location slices and buf
// are carried over (readers reset, slices truncated to length 0) so their
// storage can be reused; every other field is cleared.
//
// Freq/norm chunk offsets are decoded when any of includeFreq, includeNorm
// or includeLocs is set; location chunk offsets are decoded only when
// includeLocs is set.
func (p *PostingsList) iterator(includeFreq, includeNorm, includeLocs bool,
	rv *PostingsIterator) *PostingsIterator {
	if rv == nil {
		rv = &PostingsIterator{}
	} else {
		// save the reusable pieces before wiping the struct
		freqNormReader := rv.freqNormReader
		if freqNormReader != nil {
			freqNormReader.Reset([]byte(nil))
		}

		locReader := rv.locReader
		if locReader != nil {
			locReader.Reset([]byte(nil))
		}

		freqChunkOffsets := rv.freqChunkOffsets[:0]
		locChunkOffsets := rv.locChunkOffsets[:0]

		nextLocs := rv.nextLocs[:0]
		nextSegmentLocs := rv.nextSegmentLocs[:0]

		buf := rv.buf

		*rv = PostingsIterator{} // clear the struct

		// restore the reusable pieces
		rv.freqNormReader = freqNormReader
		rv.locReader = locReader

		rv.freqChunkOffsets = freqChunkOffsets
		rv.locChunkOffsets = locChunkOffsets

		rv.nextLocs = nextLocs
		rv.nextSegmentLocs = nextSegmentLocs

		rv.buf = buf
	}

	rv.postings = p
	// the freq/norm stream must be decoded whenever freq, norm or locs
	// are requested
	rv.includeFreqNorm = includeFreq || includeNorm || includeLocs
	rv.includeLocs = includeLocs

	if p.normBits1Hit != 0 {
		// "1-hit" encoding
		rv.docNum1Hit = p.docNum1Hit
		rv.normBits1Hit = p.normBits1Hit

		// an excluded 1-hit docNum leaves the iterator already finished
		if p.except != nil && p.except.Contains(uint32(rv.docNum1Hit)) {
			rv.docNum1Hit = DocNum1HitFinished
		}

		return rv
	}

	// "general" encoding, check if empty
	if p.postings == nil {
		return rv
	}

	// n is the number of bytes consumed so far from the current table
	var n uint64
	var read int

	// prepare the freq chunk details
	if rv.includeFreqNorm {
		var numFreqChunks uint64
		numFreqChunks, read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64])
		n += uint64(read)
		// reuse the existing offsets slice when it is large enough
		if cap(rv.freqChunkOffsets) >= int(numFreqChunks) {
			rv.freqChunkOffsets = rv.freqChunkOffsets[:int(numFreqChunks)]
		} else {
			rv.freqChunkOffsets = make([]uint64, int(numFreqChunks))
		}
		for i := 0; i < int(numFreqChunks); i++ {
			rv.freqChunkOffsets[i], read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64])
			n += uint64(read)
		}
		// chunk data begins right after the offsets table
		rv.freqChunkStart = p.freqOffset + n
	}

	// prepare the loc chunk details
	if rv.includeLocs {
		n = 0
		var numLocChunks uint64
		numLocChunks, read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64])
		n += uint64(read)
		// reuse the existing offsets slice when it is large enough
		if cap(rv.locChunkOffsets) >= int(numLocChunks) {
			rv.locChunkOffsets = rv.locChunkOffsets[:int(numLocChunks)]
		} else {
			rv.locChunkOffsets = make([]uint64, int(numLocChunks))
		}
		for i := 0; i < int(numLocChunks); i++ {
			rv.locChunkOffsets[i], read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64])
			n += uint64(read)
		}
		// chunk data begins right after the offsets table
		rv.locChunkStart = p.locOffset + n
	}

	rv.all = p.postings.Iterator()
	if p.except != nil {
		// Actual walks postings minus the excluded docNums
		rv.ActualBM = roaring.AndNot(p.postings, p.except)
		rv.Actual = rv.ActualBM.Iterator()
	} else {
		rv.ActualBM = p.postings
		rv.Actual = rv.all // Optimize to use same iterator for all & Actual.
	}

	return rv
}
  218. // Count returns the number of items on this postings list
  219. func (p *PostingsList) Count() uint64 {
  220. var n, e uint64
  221. if p.normBits1Hit != 0 {
  222. n = 1
  223. if p.except != nil && p.except.Contains(uint32(p.docNum1Hit)) {
  224. e = 1
  225. }
  226. } else if p.postings != nil {
  227. n = p.postings.GetCardinality()
  228. if p.except != nil {
  229. e = p.postings.AndCardinality(p.except)
  230. }
  231. }
  232. return n - e
  233. }
  234. func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error {
  235. rv.postingsOffset = postingsOffset
  236. // handle "1-hit" encoding special case
  237. if rv.postingsOffset&FSTValEncodingMask == FSTValEncoding1Hit {
  238. return rv.init1Hit(postingsOffset)
  239. }
  240. // read the location of the freq/norm details
  241. var n uint64
  242. var read int
  243. rv.freqOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+binary.MaxVarintLen64])
  244. n += uint64(read)
  245. rv.locOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64])
  246. n += uint64(read)
  247. var postingsLen uint64
  248. postingsLen, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64])
  249. n += uint64(read)
  250. roaringBytes := d.sb.mem[postingsOffset+n : postingsOffset+n+postingsLen]
  251. if rv.postings == nil {
  252. rv.postings = roaring.NewBitmap()
  253. }
  254. _, err := rv.postings.FromBuffer(roaringBytes)
  255. if err != nil {
  256. return fmt.Errorf("error loading roaring bitmap: %v", err)
  257. }
  258. return nil
  259. }
  260. func (rv *PostingsList) init1Hit(fstVal uint64) error {
  261. docNum, normBits := FSTValDecode1Hit(fstVal)
  262. rv.docNum1Hit = docNum
  263. rv.normBits1Hit = normBits
  264. return nil
  265. }
// PostingsIterator provides a way to iterate through the postings list
type PostingsIterator struct {
	// postings is the list being iterated; it is nil for iterators built
	// directly from a bitmap or a 1-hit docNum.
	postings *PostingsList
	// all walks every docNum in postings.postings, including excluded ones.
	all roaring.IntPeekable
	// Actual walks ActualBM, the docNums that are actually returned.
	Actual   roaring.IntPeekable
	ActualBM *roaring.Bitmap

	// currChunk is the index of the chunk most recently loaded by
	// loadChunk; currChunkFreqNorm / currChunkLoc are that chunk's bytes.
	currChunk         uint32
	currChunkFreqNorm []byte
	currChunkLoc      []byte

	// uvarint readers positioned within the current chunk's bytes
	freqNormReader *segment.MemUvarintReader
	locReader      *segment.MemUvarintReader

	// per-chunk offsets and the position in the segment's mem where the
	// chunk data begins, for the freq/norm and location streams
	freqChunkOffsets []uint64
	freqChunkStart   uint64

	locChunkOffsets []uint64
	locChunkStart   uint64

	next            Posting            // reused across Next() calls
	nextLocs        []Location         // reused across Next() calls
	nextSegmentLocs []segment.Location // reused across Next() calls

	// when normBits1Hit != 0 the iterator serves a single "1-hit" posting;
	// docNum1Hit becomes DocNum1HitFinished once it has been consumed
	docNum1Hit   uint64
	normBits1Hit uint64

	// buf is scratch space used by nextBytes for the 1-hit encoding
	buf []byte

	// includeFreqNorm is set when freq, norm or locations were requested;
	// includeLocs is set when locations were requested
	includeFreqNorm bool
	includeLocs     bool
}
  290. var emptyPostingsIterator = &PostingsIterator{}
  291. func (i *PostingsIterator) Size() int {
  292. sizeInBytes := reflectStaticSizePostingsIterator + size.SizeOfPtr +
  293. len(i.currChunkFreqNorm) +
  294. len(i.currChunkLoc) +
  295. len(i.freqChunkOffsets)*size.SizeOfUint64 +
  296. len(i.locChunkOffsets)*size.SizeOfUint64 +
  297. i.next.Size()
  298. for _, entry := range i.nextLocs {
  299. sizeInBytes += entry.Size()
  300. }
  301. return sizeInBytes
  302. }
  303. func (i *PostingsIterator) loadChunk(chunk int) error {
  304. if i.includeFreqNorm {
  305. if chunk >= len(i.freqChunkOffsets) {
  306. return fmt.Errorf("tried to load freq chunk that doesn't exist %d/(%d)",
  307. chunk, len(i.freqChunkOffsets))
  308. }
  309. end, start := i.freqChunkStart, i.freqChunkStart
  310. s, e := readChunkBoundary(chunk, i.freqChunkOffsets)
  311. start += s
  312. end += e
  313. i.currChunkFreqNorm = i.postings.sb.mem[start:end]
  314. if i.freqNormReader == nil {
  315. i.freqNormReader = segment.NewMemUvarintReader(i.currChunkFreqNorm)
  316. } else {
  317. i.freqNormReader.Reset(i.currChunkFreqNorm)
  318. }
  319. }
  320. if i.includeLocs {
  321. if chunk >= len(i.locChunkOffsets) {
  322. return fmt.Errorf("tried to load loc chunk that doesn't exist %d/(%d)",
  323. chunk, len(i.locChunkOffsets))
  324. }
  325. end, start := i.locChunkStart, i.locChunkStart
  326. s, e := readChunkBoundary(chunk, i.locChunkOffsets)
  327. start += s
  328. end += e
  329. i.currChunkLoc = i.postings.sb.mem[start:end]
  330. if i.locReader == nil {
  331. i.locReader = segment.NewMemUvarintReader(i.currChunkLoc)
  332. } else {
  333. i.locReader.Reset(i.currChunkLoc)
  334. }
  335. }
  336. i.currChunk = uint32(chunk)
  337. return nil
  338. }
  339. func (i *PostingsIterator) readFreqNormHasLocs() (uint64, uint64, bool, error) {
  340. if i.normBits1Hit != 0 {
  341. return 1, i.normBits1Hit, false, nil
  342. }
  343. freqHasLocs, err := i.freqNormReader.ReadUvarint()
  344. if err != nil {
  345. return 0, 0, false, fmt.Errorf("error reading frequency: %v", err)
  346. }
  347. freq, hasLocs := decodeFreqHasLocs(freqHasLocs)
  348. normBits, err := i.freqNormReader.ReadUvarint()
  349. if err != nil {
  350. return 0, 0, false, fmt.Errorf("error reading norm: %v", err)
  351. }
  352. return freq, normBits, hasLocs, nil
  353. }
  354. func (i *PostingsIterator) skipFreqNormReadHasLocs() (bool, error) {
  355. if i.normBits1Hit != 0 {
  356. return false, nil
  357. }
  358. freqHasLocs, err := i.freqNormReader.ReadUvarint()
  359. if err != nil {
  360. return false, fmt.Errorf("error reading freqHasLocs: %v", err)
  361. }
  362. i.freqNormReader.SkipUvarint() // Skip normBits.
  363. return freqHasLocs&0x01 != 0, nil // See decodeFreqHasLocs() / hasLocs.
  364. }
  365. func encodeFreqHasLocs(freq uint64, hasLocs bool) uint64 {
  366. rv := freq << 1
  367. if hasLocs {
  368. rv = rv | 0x01 // 0'th LSB encodes whether there are locations
  369. }
  370. return rv
  371. }
  372. func decodeFreqHasLocs(freqHasLocs uint64) (uint64, bool) {
  373. freq := freqHasLocs >> 1
  374. hasLocs := freqHasLocs&0x01 != 0
  375. return freq, hasLocs
  376. }
  377. // readLocation processes all the integers on the stream representing a single
  378. // location.
  379. func (i *PostingsIterator) readLocation(l *Location) error {
  380. // read off field
  381. fieldID, err := i.locReader.ReadUvarint()
  382. if err != nil {
  383. return fmt.Errorf("error reading location field: %v", err)
  384. }
  385. // read off pos
  386. pos, err := i.locReader.ReadUvarint()
  387. if err != nil {
  388. return fmt.Errorf("error reading location pos: %v", err)
  389. }
  390. // read off start
  391. start, err := i.locReader.ReadUvarint()
  392. if err != nil {
  393. return fmt.Errorf("error reading location start: %v", err)
  394. }
  395. // read off end
  396. end, err := i.locReader.ReadUvarint()
  397. if err != nil {
  398. return fmt.Errorf("error reading location end: %v", err)
  399. }
  400. // read off num array pos
  401. numArrayPos, err := i.locReader.ReadUvarint()
  402. if err != nil {
  403. return fmt.Errorf("error reading location num array pos: %v", err)
  404. }
  405. l.field = i.postings.sb.fieldsInv[fieldID]
  406. l.pos = pos
  407. l.start = start
  408. l.end = end
  409. if cap(l.ap) < int(numArrayPos) {
  410. l.ap = make([]uint64, int(numArrayPos))
  411. } else {
  412. l.ap = l.ap[:int(numArrayPos)]
  413. }
  414. // read off array positions
  415. for k := 0; k < int(numArrayPos); k++ {
  416. ap, err := i.locReader.ReadUvarint()
  417. if err != nil {
  418. return fmt.Errorf("error reading array position: %v", err)
  419. }
  420. l.ap[k] = ap
  421. }
  422. return nil
  423. }
  424. // Next returns the next posting on the postings list, or nil at the end
  425. func (i *PostingsIterator) Next() (segment.Posting, error) {
  426. return i.nextAtOrAfter(0)
  427. }
  428. // Advance returns the posting at the specified docNum or it is not present
  429. // the next posting, or if the end is reached, nil
  430. func (i *PostingsIterator) Advance(docNum uint64) (segment.Posting, error) {
  431. return i.nextAtOrAfter(docNum)
  432. }
  433. // Next returns the next posting on the postings list, or nil at the end
  434. func (i *PostingsIterator) nextAtOrAfter(atOrAfter uint64) (segment.Posting, error) {
  435. docNum, exists, err := i.nextDocNumAtOrAfter(atOrAfter)
  436. if err != nil || !exists {
  437. return nil, err
  438. }
  439. i.next = Posting{} // clear the struct
  440. rv := &i.next
  441. rv.docNum = docNum
  442. if !i.includeFreqNorm {
  443. return rv, nil
  444. }
  445. var normBits uint64
  446. var hasLocs bool
  447. rv.freq, normBits, hasLocs, err = i.readFreqNormHasLocs()
  448. if err != nil {
  449. return nil, err
  450. }
  451. rv.norm = math.Float32frombits(uint32(normBits))
  452. if i.includeLocs && hasLocs {
  453. // prepare locations into reused slices, where we assume
  454. // rv.freq >= "number of locs", since in a composite field,
  455. // some component fields might have their IncludeTermVector
  456. // flags disabled while other component fields are enabled
  457. if cap(i.nextLocs) >= int(rv.freq) {
  458. i.nextLocs = i.nextLocs[0:rv.freq]
  459. } else {
  460. i.nextLocs = make([]Location, rv.freq, rv.freq*2)
  461. }
  462. if cap(i.nextSegmentLocs) < int(rv.freq) {
  463. i.nextSegmentLocs = make([]segment.Location, rv.freq, rv.freq*2)
  464. }
  465. rv.locs = i.nextSegmentLocs[:0]
  466. numLocsBytes, err := i.locReader.ReadUvarint()
  467. if err != nil {
  468. return nil, fmt.Errorf("error reading location numLocsBytes: %v", err)
  469. }
  470. j := 0
  471. startBytesRemaining := i.locReader.Len() // # bytes remaining in the locReader
  472. for startBytesRemaining-i.locReader.Len() < int(numLocsBytes) {
  473. err := i.readLocation(&i.nextLocs[j])
  474. if err != nil {
  475. return nil, err
  476. }
  477. rv.locs = append(rv.locs, &i.nextLocs[j])
  478. j++
  479. }
  480. }
  481. return rv, nil
  482. }
  483. var freqHasLocs1Hit = encodeFreqHasLocs(1, false)
  484. // nextBytes returns the docNum and the encoded freq & loc bytes for
  485. // the next posting
  486. func (i *PostingsIterator) nextBytes() (
  487. docNumOut uint64, freq uint64, normBits uint64,
  488. bytesFreqNorm []byte, bytesLoc []byte, err error) {
  489. docNum, exists, err := i.nextDocNumAtOrAfter(0)
  490. if err != nil || !exists {
  491. return 0, 0, 0, nil, nil, err
  492. }
  493. if i.normBits1Hit != 0 {
  494. if i.buf == nil {
  495. i.buf = make([]byte, binary.MaxVarintLen64*2)
  496. }
  497. n := binary.PutUvarint(i.buf, freqHasLocs1Hit)
  498. n += binary.PutUvarint(i.buf[n:], i.normBits1Hit)
  499. return docNum, uint64(1), i.normBits1Hit, i.buf[:n], nil, nil
  500. }
  501. startFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len()
  502. var hasLocs bool
  503. freq, normBits, hasLocs, err = i.readFreqNormHasLocs()
  504. if err != nil {
  505. return 0, 0, 0, nil, nil, err
  506. }
  507. endFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len()
  508. bytesFreqNorm = i.currChunkFreqNorm[startFreqNorm:endFreqNorm]
  509. if hasLocs {
  510. startLoc := len(i.currChunkLoc) - i.locReader.Len()
  511. numLocsBytes, err := i.locReader.ReadUvarint()
  512. if err != nil {
  513. return 0, 0, 0, nil, nil,
  514. fmt.Errorf("error reading location nextBytes numLocs: %v", err)
  515. }
  516. // skip over all the location bytes
  517. i.locReader.SkipBytes(int(numLocsBytes))
  518. endLoc := len(i.currChunkLoc) - i.locReader.Len()
  519. bytesLoc = i.currChunkLoc[startLoc:endLoc]
  520. }
  521. return docNum, freq, normBits, bytesFreqNorm, bytesLoc, nil
  522. }
// nextDocNumAtOrAfter returns the next docNum on the postings list that is
// >= atOrAfter, and also sets up the currChunk / loc related fields of the
// iterator.  The bool result is false when no such docNum remains.
func (i *PostingsIterator) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool, error) {
	// "1-hit" encoding: at most one docNum, and it is handed out only once
	if i.normBits1Hit != 0 {
		if i.docNum1Hit == DocNum1HitFinished {
			return 0, false, nil
		}
		if i.docNum1Hit < atOrAfter {
			// advanced past our 1-hit
			i.docNum1Hit = DocNum1HitFinished // consume our 1-hit docNum
			return 0, false, nil
		}
		docNum := i.docNum1Hit
		i.docNum1Hit = DocNum1HitFinished // consume our 1-hit docNum
		return docNum, true, nil
	}

	if i.Actual == nil || !i.Actual.HasNext() {
		return 0, false, nil
	}

	// no postings list, or nothing excluded: 'all' and 'Actual' are the
	// same bitmap, so take the simpler path
	if i.postings == nil || i.postings.postings == i.ActualBM {
		return i.nextDocNumAtOrAfterClean(atOrAfter)
	}

	i.Actual.AdvanceIfNeeded(uint32(atOrAfter))

	if !i.Actual.HasNext() {
		// couldn't find anything
		return 0, false, nil
	}

	n := i.Actual.Next()
	allN := i.all.Next()

	nChunk := n / i.postings.sb.chunkFactor

	// when allN becomes >= to here, then allN is in the same chunk as nChunk.
	allNReachesNChunk := nChunk * i.postings.sb.chunkFactor

	// n is the next actual hit (excluding some postings), and
	// allN is the next hit in the full postings, and
	// if they don't match, move 'all' forwards until they do
	for allN != n {
		// we've reached same chunk, so move the freq/norm/loc decoders forward
		if i.includeFreqNorm && allN >= allNReachesNChunk {
			err := i.currChunkNext(nChunk)
			if err != nil {
				return 0, false, err
			}
		}

		allN = i.all.Next()
	}

	// make sure the chunk holding n is the one currently loaded
	if i.includeFreqNorm && (i.currChunk != nChunk || i.currChunkFreqNorm == nil) {
		err := i.loadChunk(int(nChunk))
		if err != nil {
			return 0, false, fmt.Errorf("error loading chunk: %v", err)
		}
	}

	return uint64(n), true, nil
}
// nextDocNumAtOrAfterClean is an optimization when the postings list is
// "clean" (e.g., no updates & no deletions) where the all bitmap is the
// same as the actual bitmap.  It returns the next docNum >= atOrAfter, or
// false when none remains.
func (i *PostingsIterator) nextDocNumAtOrAfterClean(
	atOrAfter uint64) (uint64, bool, error) {
	// without freq/norm there is no chunk reader to keep in step, so the
	// bitmap iterator can jump straight to the target
	if !i.includeFreqNorm {
		i.Actual.AdvanceIfNeeded(uint32(atOrAfter))

		if !i.Actual.HasNext() {
			return 0, false, nil // couldn't find anything
		}

		return uint64(i.Actual.Next()), true, nil
	}

	// freq-norm's needed, so maintain freq-norm chunk reader
	sameChunkNexts := 0 // # of times we called Next() in the same chunk
	n := i.Actual.Next()
	nChunk := n / i.postings.sb.chunkFactor

	// step docNum by docNum until atOrAfter is reached, counting how many
	// postings were passed over inside the chunk that finally holds n
	for uint64(n) < atOrAfter && i.Actual.HasNext() {
		n = i.Actual.Next()

		nChunkPrev := nChunk
		nChunk = n / i.postings.sb.chunkFactor

		if nChunk != nChunkPrev {
			sameChunkNexts = 0
		} else {
			sameChunkNexts += 1
		}
	}

	if uint64(n) < atOrAfter {
		// couldn't find anything
		return 0, false, nil
	}

	// move the chunk readers past the postings that were stepped over
	for j := 0; j < sameChunkNexts; j++ {
		err := i.currChunkNext(nChunk)
		if err != nil {
			return 0, false, fmt.Errorf("error optimized currChunkNext: %v", err)
		}
	}

	// make sure the chunk holding n is the one currently loaded
	if i.currChunk != nChunk || i.currChunkFreqNorm == nil {
		err := i.loadChunk(int(nChunk))
		if err != nil {
			return 0, false, fmt.Errorf("error loading chunk: %v", err)
		}
	}

	return uint64(n), true, nil
}
  619. func (i *PostingsIterator) currChunkNext(nChunk uint32) error {
  620. if i.currChunk != nChunk || i.currChunkFreqNorm == nil {
  621. err := i.loadChunk(int(nChunk))
  622. if err != nil {
  623. return fmt.Errorf("error loading chunk: %v", err)
  624. }
  625. }
  626. // read off freq/offsets even though we don't care about them
  627. hasLocs, err := i.skipFreqNormReadHasLocs()
  628. if err != nil {
  629. return err
  630. }
  631. if i.includeLocs && hasLocs {
  632. numLocsBytes, err := i.locReader.ReadUvarint()
  633. if err != nil {
  634. return fmt.Errorf("error reading location numLocsBytes: %v", err)
  635. }
  636. // skip over all the location bytes
  637. i.locReader.SkipBytes(int(numLocsBytes))
  638. }
  639. return nil
  640. }
  641. // DocNum1Hit returns the docNum and true if this is "1-hit" optimized
  642. // and the docNum is available.
  643. func (p *PostingsIterator) DocNum1Hit() (uint64, bool) {
  644. if p.normBits1Hit != 0 && p.docNum1Hit != DocNum1HitFinished {
  645. return p.docNum1Hit, true
  646. }
  647. return 0, false
  648. }
  649. // ActualBitmap returns the underlying actual bitmap
  650. // which can be used up the stack for optimizations
  651. func (p *PostingsIterator) ActualBitmap() *roaring.Bitmap {
  652. return p.ActualBM
  653. }
  654. // ReplaceActual replaces the ActualBM with the provided
  655. // bitmap
  656. func (p *PostingsIterator) ReplaceActual(abm *roaring.Bitmap) {
  657. p.ActualBM = abm
  658. p.Actual = abm.Iterator()
  659. }
  660. // PostingsIteratorFromBitmap constructs a PostingsIterator given an
  661. // "actual" bitmap.
  662. func PostingsIteratorFromBitmap(bm *roaring.Bitmap,
  663. includeFreqNorm, includeLocs bool) (segment.PostingsIterator, error) {
  664. return &PostingsIterator{
  665. ActualBM: bm,
  666. Actual: bm.Iterator(),
  667. includeFreqNorm: includeFreqNorm,
  668. includeLocs: includeLocs,
  669. }, nil
  670. }
  671. // PostingsIteratorFrom1Hit constructs a PostingsIterator given a
  672. // 1-hit docNum.
  673. func PostingsIteratorFrom1Hit(docNum1Hit uint64,
  674. includeFreqNorm, includeLocs bool) (segment.PostingsIterator, error) {
  675. return &PostingsIterator{
  676. docNum1Hit: docNum1Hit,
  677. normBits1Hit: NormBits1Hit,
  678. includeFreqNorm: includeFreqNorm,
  679. includeLocs: includeLocs,
  680. }, nil
  681. }
// Posting is a single entry in a postings list
type Posting struct {
	// docNum is the document number this posting refers to.
	docNum uint64
	// freq is the term frequency decoded from the freq/norm stream.
	freq uint64
	// norm is the normalization factor, rebuilt from its float32 bits.
	norm float32
	// locs holds the locations decoded for this posting, if any.
	locs []segment.Location
}
  689. func (p *Posting) Size() int {
  690. sizeInBytes := reflectStaticSizePosting
  691. for _, entry := range p.locs {
  692. sizeInBytes += entry.Size()
  693. }
  694. return sizeInBytes
  695. }
  696. // Number returns the document number of this posting in this segment
  697. func (p *Posting) Number() uint64 {
  698. return p.docNum
  699. }
  700. // Frequency returns the frequencies of occurrence of this term in this doc/field
  701. func (p *Posting) Frequency() uint64 {
  702. return p.freq
  703. }
  704. // Norm returns the normalization factor for this posting
  705. func (p *Posting) Norm() float64 {
  706. return float64(p.norm)
  707. }
  708. // Locations returns the location information for each occurrence
  709. func (p *Posting) Locations() []segment.Location {
  710. return p.locs
  711. }
// Location represents the location of a single occurrence
type Location struct {
	// field is the name of the field the occurrence came from.
	field string
	// pos is the 1-based phrase position of the occurrence.
	pos uint64
	// start and end are the byte offsets of the occurrence.
	start uint64
	end   uint64
	// ap is the array position vector of the occurrence.
	ap []uint64
}
  720. func (l *Location) Size() int {
  721. return reflectStaticSizeLocation +
  722. len(l.field) +
  723. len(l.ap)*size.SizeOfUint64
  724. }
  725. // Field returns the name of the field (useful in composite fields to know
  726. // which original field the value came from)
  727. func (l *Location) Field() string {
  728. return l.field
  729. }
  730. // Start returns the start byte offset of this occurrence
  731. func (l *Location) Start() uint64 {
  732. return l.start
  733. }
  734. // End returns the end byte offset of this occurrence
  735. func (l *Location) End() uint64 {
  736. return l.end
  737. }
  738. // Pos returns the 1-based phrase position of this occurrence
  739. func (l *Location) Pos() uint64 {
  740. return l.pos
  741. }
  742. // ArrayPositions returns the array position vector associated with this occurrence
  743. func (l *Location) ArrayPositions() []uint64 {
  744. return l.ap
  745. }