You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

optimize.go 11KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396
  1. // Copyright (c) 2018 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package scorch
  15. import (
  16. "fmt"
  17. "github.com/RoaringBitmap/roaring"
  18. index "github.com/blevesearch/bleve_index_api"
  19. segment "github.com/blevesearch/scorch_segment_api/v2"
  20. "sync/atomic"
  21. )
  22. var OptimizeConjunction = true
  23. var OptimizeConjunctionUnadorned = true
  24. var OptimizeDisjunctionUnadorned = true
  25. func (s *IndexSnapshotTermFieldReader) Optimize(kind string,
  26. octx index.OptimizableContext) (index.OptimizableContext, error) {
  27. if OptimizeConjunction && kind == "conjunction" {
  28. return s.optimizeConjunction(octx)
  29. }
  30. if OptimizeConjunctionUnadorned && kind == "conjunction:unadorned" {
  31. return s.optimizeConjunctionUnadorned(octx)
  32. }
  33. if OptimizeDisjunctionUnadorned && kind == "disjunction:unadorned" {
  34. return s.optimizeDisjunctionUnadorned(octx)
  35. }
  36. return nil, nil
  37. }
  38. var OptimizeDisjunctionUnadornedMinChildCardinality = uint64(256)
  39. // ----------------------------------------------------------------
  40. func (s *IndexSnapshotTermFieldReader) optimizeConjunction(
  41. octx index.OptimizableContext) (index.OptimizableContext, error) {
  42. if octx == nil {
  43. octx = &OptimizeTFRConjunction{snapshot: s.snapshot}
  44. }
  45. o, ok := octx.(*OptimizeTFRConjunction)
  46. if !ok {
  47. return octx, nil
  48. }
  49. if o.snapshot != s.snapshot {
  50. return nil, fmt.Errorf("tried to optimize conjunction across different snapshots")
  51. }
  52. o.tfrs = append(o.tfrs, s)
  53. return o, nil
  54. }
  55. type OptimizeTFRConjunction struct {
  56. snapshot *IndexSnapshot
  57. tfrs []*IndexSnapshotTermFieldReader
  58. }
  59. func (o *OptimizeTFRConjunction) Finish() (index.Optimized, error) {
  60. if len(o.tfrs) <= 1 {
  61. return nil, nil
  62. }
  63. for i := range o.snapshot.segment {
  64. itr0, ok := o.tfrs[0].iterators[i].(segment.OptimizablePostingsIterator)
  65. if !ok || itr0.ActualBitmap() == nil {
  66. continue
  67. }
  68. itr1, ok := o.tfrs[1].iterators[i].(segment.OptimizablePostingsIterator)
  69. if !ok || itr1.ActualBitmap() == nil {
  70. continue
  71. }
  72. bm := roaring.And(itr0.ActualBitmap(), itr1.ActualBitmap())
  73. for _, tfr := range o.tfrs[2:] {
  74. itr, ok := tfr.iterators[i].(segment.OptimizablePostingsIterator)
  75. if !ok || itr.ActualBitmap() == nil {
  76. continue
  77. }
  78. bm.And(itr.ActualBitmap())
  79. }
  80. // in this conjunction optimization, the postings iterators
  81. // will all share the same AND'ed together actual bitmap. The
  82. // regular conjunction searcher machinery will still be used,
  83. // but the underlying bitmap will be smaller.
  84. for _, tfr := range o.tfrs {
  85. itr, ok := tfr.iterators[i].(segment.OptimizablePostingsIterator)
  86. if ok && itr.ActualBitmap() != nil {
  87. itr.ReplaceActual(bm)
  88. }
  89. }
  90. }
  91. return nil, nil
  92. }
  93. // ----------------------------------------------------------------
  94. // An "unadorned" conjunction optimization is appropriate when
  95. // additional or subsidiary information like freq-norm's and
  96. // term-vectors are not required, and instead only the internal-id's
  97. // are needed.
  98. func (s *IndexSnapshotTermFieldReader) optimizeConjunctionUnadorned(
  99. octx index.OptimizableContext) (index.OptimizableContext, error) {
  100. if octx == nil {
  101. octx = &OptimizeTFRConjunctionUnadorned{snapshot: s.snapshot}
  102. }
  103. o, ok := octx.(*OptimizeTFRConjunctionUnadorned)
  104. if !ok {
  105. return nil, nil
  106. }
  107. if o.snapshot != s.snapshot {
  108. return nil, fmt.Errorf("tried to optimize unadorned conjunction across different snapshots")
  109. }
  110. o.tfrs = append(o.tfrs, s)
  111. return o, nil
  112. }
  113. type OptimizeTFRConjunctionUnadorned struct {
  114. snapshot *IndexSnapshot
  115. tfrs []*IndexSnapshotTermFieldReader
  116. }
  117. var OptimizeTFRConjunctionUnadornedTerm = []byte("<conjunction:unadorned>")
  118. var OptimizeTFRConjunctionUnadornedField = "*"
  119. // Finish of an unadorned conjunction optimization will compute a
  120. // termFieldReader with an "actual" bitmap that represents the
  121. // constituent bitmaps AND'ed together. This termFieldReader cannot
  122. // provide any freq-norm or termVector associated information.
  123. func (o *OptimizeTFRConjunctionUnadorned) Finish() (rv index.Optimized, err error) {
  124. if len(o.tfrs) <= 1 {
  125. return nil, nil
  126. }
  127. // We use an artificial term and field because the optimized
  128. // termFieldReader can represent multiple terms and fields.
  129. oTFR := o.snapshot.unadornedTermFieldReader(
  130. OptimizeTFRConjunctionUnadornedTerm, OptimizeTFRConjunctionUnadornedField)
  131. var actualBMs []*roaring.Bitmap // Collected from regular posting lists.
  132. OUTER:
  133. for i := range o.snapshot.segment {
  134. actualBMs = actualBMs[:0]
  135. var docNum1HitLast uint64
  136. var docNum1HitLastOk bool
  137. for _, tfr := range o.tfrs {
  138. if _, ok := tfr.iterators[i].(*emptyPostingsIterator); ok {
  139. // An empty postings iterator means the entire AND is empty.
  140. oTFR.iterators[i] = anEmptyPostingsIterator
  141. continue OUTER
  142. }
  143. itr, ok := tfr.iterators[i].(segment.OptimizablePostingsIterator)
  144. if !ok {
  145. // We only optimize postings iterators that support this operation.
  146. return nil, nil
  147. }
  148. // If the postings iterator is "1-hit" optimized, then we
  149. // can perform several optimizations up-front here.
  150. docNum1Hit, ok := itr.DocNum1Hit()
  151. if ok {
  152. if docNum1HitLastOk && docNum1HitLast != docNum1Hit {
  153. // The docNum1Hit doesn't match the previous
  154. // docNum1HitLast, so the entire AND is empty.
  155. oTFR.iterators[i] = anEmptyPostingsIterator
  156. continue OUTER
  157. }
  158. docNum1HitLast = docNum1Hit
  159. docNum1HitLastOk = true
  160. continue
  161. }
  162. if itr.ActualBitmap() == nil {
  163. // An empty actual bitmap means the entire AND is empty.
  164. oTFR.iterators[i] = anEmptyPostingsIterator
  165. continue OUTER
  166. }
  167. // Collect the actual bitmap for more processing later.
  168. actualBMs = append(actualBMs, itr.ActualBitmap())
  169. }
  170. if docNum1HitLastOk {
  171. // We reach here if all the 1-hit optimized posting
  172. // iterators had the same 1-hit docNum, so we can check if
  173. // our collected actual bitmaps also have that docNum.
  174. for _, bm := range actualBMs {
  175. if !bm.Contains(uint32(docNum1HitLast)) {
  176. // The docNum1Hit isn't in one of our actual
  177. // bitmaps, so the entire AND is empty.
  178. oTFR.iterators[i] = anEmptyPostingsIterator
  179. continue OUTER
  180. }
  181. }
  182. // The actual bitmaps and docNum1Hits all contain or have
  183. // the same 1-hit docNum, so that's our AND'ed result.
  184. oTFR.iterators[i] = newUnadornedPostingsIteratorFrom1Hit(docNum1HitLast)
  185. continue OUTER
  186. }
  187. if len(actualBMs) == 0 {
  188. // If we've collected no actual bitmaps at this point,
  189. // then the entire AND is empty.
  190. oTFR.iterators[i] = anEmptyPostingsIterator
  191. continue OUTER
  192. }
  193. if len(actualBMs) == 1 {
  194. // If we've only 1 actual bitmap, then that's our result.
  195. oTFR.iterators[i] = newUnadornedPostingsIteratorFromBitmap(actualBMs[0])
  196. continue OUTER
  197. }
  198. // Else, AND together our collected bitmaps as our result.
  199. bm := roaring.And(actualBMs[0], actualBMs[1])
  200. for _, actualBM := range actualBMs[2:] {
  201. bm.And(actualBM)
  202. }
  203. oTFR.iterators[i] = newUnadornedPostingsIteratorFromBitmap(bm)
  204. }
  205. atomic.AddUint64(&o.snapshot.parent.stats.TotTermSearchersStarted, uint64(1))
  206. return oTFR, nil
  207. }
  208. // ----------------------------------------------------------------
  209. // An "unadorned" disjunction optimization is appropriate when
  210. // additional or subsidiary information like freq-norm's and
  211. // term-vectors are not required, and instead only the internal-id's
  212. // are needed.
  213. func (s *IndexSnapshotTermFieldReader) optimizeDisjunctionUnadorned(
  214. octx index.OptimizableContext) (index.OptimizableContext, error) {
  215. if octx == nil {
  216. octx = &OptimizeTFRDisjunctionUnadorned{
  217. snapshot: s.snapshot,
  218. }
  219. }
  220. o, ok := octx.(*OptimizeTFRDisjunctionUnadorned)
  221. if !ok {
  222. return nil, nil
  223. }
  224. if o.snapshot != s.snapshot {
  225. return nil, fmt.Errorf("tried to optimize unadorned disjunction across different snapshots")
  226. }
  227. o.tfrs = append(o.tfrs, s)
  228. return o, nil
  229. }
  230. type OptimizeTFRDisjunctionUnadorned struct {
  231. snapshot *IndexSnapshot
  232. tfrs []*IndexSnapshotTermFieldReader
  233. }
  234. var OptimizeTFRDisjunctionUnadornedTerm = []byte("<disjunction:unadorned>")
  235. var OptimizeTFRDisjunctionUnadornedField = "*"
  236. // Finish of an unadorned disjunction optimization will compute a
  237. // termFieldReader with an "actual" bitmap that represents the
  238. // constituent bitmaps OR'ed together. This termFieldReader cannot
  239. // provide any freq-norm or termVector associated information.
  240. func (o *OptimizeTFRDisjunctionUnadorned) Finish() (rv index.Optimized, err error) {
  241. if len(o.tfrs) <= 1 {
  242. return nil, nil
  243. }
  244. for i := range o.snapshot.segment {
  245. var cMax uint64
  246. for _, tfr := range o.tfrs {
  247. itr, ok := tfr.iterators[i].(segment.OptimizablePostingsIterator)
  248. if !ok {
  249. return nil, nil
  250. }
  251. if itr.ActualBitmap() != nil {
  252. c := itr.ActualBitmap().GetCardinality()
  253. if cMax < c {
  254. cMax = c
  255. }
  256. }
  257. }
  258. }
  259. // We use an artificial term and field because the optimized
  260. // termFieldReader can represent multiple terms and fields.
  261. oTFR := o.snapshot.unadornedTermFieldReader(
  262. OptimizeTFRDisjunctionUnadornedTerm, OptimizeTFRDisjunctionUnadornedField)
  263. var docNums []uint32 // Collected docNum's from 1-hit posting lists.
  264. var actualBMs []*roaring.Bitmap // Collected from regular posting lists.
  265. for i := range o.snapshot.segment {
  266. docNums = docNums[:0]
  267. actualBMs = actualBMs[:0]
  268. for _, tfr := range o.tfrs {
  269. itr, ok := tfr.iterators[i].(segment.OptimizablePostingsIterator)
  270. if !ok {
  271. return nil, nil
  272. }
  273. docNum, ok := itr.DocNum1Hit()
  274. if ok {
  275. docNums = append(docNums, uint32(docNum))
  276. continue
  277. }
  278. if itr.ActualBitmap() != nil {
  279. actualBMs = append(actualBMs, itr.ActualBitmap())
  280. }
  281. }
  282. var bm *roaring.Bitmap
  283. if len(actualBMs) > 2 {
  284. bm = roaring.HeapOr(actualBMs...)
  285. } else if len(actualBMs) == 2 {
  286. bm = roaring.Or(actualBMs[0], actualBMs[1])
  287. } else if len(actualBMs) == 1 {
  288. bm = actualBMs[0].Clone()
  289. }
  290. if bm == nil {
  291. bm = roaring.New()
  292. }
  293. bm.AddMany(docNums)
  294. oTFR.iterators[i] = newUnadornedPostingsIteratorFromBitmap(bm)
  295. }
  296. atomic.AddUint64(&o.snapshot.parent.stats.TotTermSearchersStarted, uint64(1))
  297. return oTFR, nil
  298. }
  299. // ----------------------------------------------------------------
  300. func (i *IndexSnapshot) unadornedTermFieldReader(
  301. term []byte, field string) *IndexSnapshotTermFieldReader {
  302. // This IndexSnapshotTermFieldReader will not be recycled, more
  303. // conversation here: https://github.com/blevesearch/bleve/pull/1438
  304. return &IndexSnapshotTermFieldReader{
  305. term: term,
  306. field: field,
  307. snapshot: i,
  308. iterators: make([]segment.PostingsIterator, len(i.segment)),
  309. segmentOffset: 0,
  310. includeFreq: false,
  311. includeNorm: false,
  312. includeTermVectors: false,
  313. recycle: false,
  314. }
  315. }