You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

optimize.go 11KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396
  1. // Copyright (c) 2018 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package scorch
  15. import (
  16. "fmt"
  17. "github.com/RoaringBitmap/roaring"
  18. "github.com/blevesearch/bleve/index"
  19. "github.com/blevesearch/bleve/index/scorch/segment"
  20. )
  21. var OptimizeConjunction = true
  22. var OptimizeConjunctionUnadorned = true
  23. var OptimizeDisjunctionUnadorned = true
  24. func (s *IndexSnapshotTermFieldReader) Optimize(kind string,
  25. octx index.OptimizableContext) (index.OptimizableContext, error) {
  26. if OptimizeConjunction && kind == "conjunction" {
  27. return s.optimizeConjunction(octx)
  28. }
  29. if OptimizeConjunctionUnadorned && kind == "conjunction:unadorned" {
  30. return s.optimizeConjunctionUnadorned(octx)
  31. }
  32. if OptimizeDisjunctionUnadorned && kind == "disjunction:unadorned" {
  33. return s.optimizeDisjunctionUnadorned(octx)
  34. }
  35. return octx, nil
  36. }
  37. var OptimizeDisjunctionUnadornedMinChildCardinality = uint64(256)
  38. // ----------------------------------------------------------------
  39. func (s *IndexSnapshotTermFieldReader) optimizeConjunction(
  40. octx index.OptimizableContext) (index.OptimizableContext, error) {
  41. if octx == nil {
  42. octx = &OptimizeTFRConjunction{snapshot: s.snapshot}
  43. }
  44. o, ok := octx.(*OptimizeTFRConjunction)
  45. if !ok {
  46. return octx, nil
  47. }
  48. if o.snapshot != s.snapshot {
  49. return nil, fmt.Errorf("tried to optimize conjunction across different snapshots")
  50. }
  51. o.tfrs = append(o.tfrs, s)
  52. return o, nil
  53. }
  54. type OptimizeTFRConjunction struct {
  55. snapshot *IndexSnapshot
  56. tfrs []*IndexSnapshotTermFieldReader
  57. }
  58. func (o *OptimizeTFRConjunction) Finish() (index.Optimized, error) {
  59. if len(o.tfrs) <= 1 {
  60. return nil, nil
  61. }
  62. for i := range o.snapshot.segment {
  63. itr0, ok := o.tfrs[0].iterators[i].(segment.OptimizablePostingsIterator)
  64. if !ok || itr0.ActualBitmap() == nil {
  65. continue
  66. }
  67. itr1, ok := o.tfrs[1].iterators[i].(segment.OptimizablePostingsIterator)
  68. if !ok || itr1.ActualBitmap() == nil {
  69. continue
  70. }
  71. bm := roaring.And(itr0.ActualBitmap(), itr1.ActualBitmap())
  72. for _, tfr := range o.tfrs[2:] {
  73. itr, ok := tfr.iterators[i].(segment.OptimizablePostingsIterator)
  74. if !ok || itr.ActualBitmap() == nil {
  75. continue
  76. }
  77. bm.And(itr.ActualBitmap())
  78. }
  79. // in this conjunction optimization, the postings iterators
  80. // will all share the same AND'ed together actual bitmap. The
  81. // regular conjunction searcher machinery will still be used,
  82. // but the underlying bitmap will be smaller.
  83. for _, tfr := range o.tfrs {
  84. itr, ok := tfr.iterators[i].(segment.OptimizablePostingsIterator)
  85. if ok && itr.ActualBitmap() != nil {
  86. itr.ReplaceActual(bm)
  87. }
  88. }
  89. }
  90. return nil, nil
  91. }
  92. // ----------------------------------------------------------------
  93. // An "unadorned" conjunction optimization is appropriate when
  94. // additional or subsidiary information like freq-norm's and
  95. // term-vectors are not required, and instead only the internal-id's
  96. // are needed.
  97. func (s *IndexSnapshotTermFieldReader) optimizeConjunctionUnadorned(
  98. octx index.OptimizableContext) (index.OptimizableContext, error) {
  99. if octx == nil {
  100. octx = &OptimizeTFRConjunctionUnadorned{snapshot: s.snapshot}
  101. }
  102. o, ok := octx.(*OptimizeTFRConjunctionUnadorned)
  103. if !ok {
  104. return nil, nil
  105. }
  106. if o.snapshot != s.snapshot {
  107. return nil, fmt.Errorf("tried to optimize unadorned conjunction across different snapshots")
  108. }
  109. o.tfrs = append(o.tfrs, s)
  110. return o, nil
  111. }
  112. type OptimizeTFRConjunctionUnadorned struct {
  113. snapshot *IndexSnapshot
  114. tfrs []*IndexSnapshotTermFieldReader
  115. }
  116. var OptimizeTFRConjunctionUnadornedTerm = []byte("<conjunction:unadorned>")
  117. var OptimizeTFRConjunctionUnadornedField = "*"
  118. // Finish of an unadorned conjunction optimization will compute a
  119. // termFieldReader with an "actual" bitmap that represents the
  120. // constituent bitmaps AND'ed together. This termFieldReader cannot
  121. // provide any freq-norm or termVector associated information.
  122. func (o *OptimizeTFRConjunctionUnadorned) Finish() (rv index.Optimized, err error) {
  123. if len(o.tfrs) <= 1 {
  124. return nil, nil
  125. }
  126. // We use an artificial term and field because the optimized
  127. // termFieldReader can represent multiple terms and fields.
  128. oTFR := &IndexSnapshotTermFieldReader{
  129. term: OptimizeTFRConjunctionUnadornedTerm,
  130. field: OptimizeTFRConjunctionUnadornedField,
  131. snapshot: o.snapshot,
  132. iterators: make([]segment.PostingsIterator, len(o.snapshot.segment)),
  133. segmentOffset: 0,
  134. includeFreq: false,
  135. includeNorm: false,
  136. includeTermVectors: false,
  137. }
  138. var actualBMs []*roaring.Bitmap // Collected from regular posting lists.
  139. OUTER:
  140. for i := range o.snapshot.segment {
  141. actualBMs = actualBMs[:0]
  142. var docNum1HitLast uint64
  143. var docNum1HitLastOk bool
  144. for _, tfr := range o.tfrs {
  145. if _, ok := tfr.iterators[i].(*segment.EmptyPostingsIterator); ok {
  146. // An empty postings iterator means the entire AND is empty.
  147. oTFR.iterators[i] = segment.AnEmptyPostingsIterator
  148. continue OUTER
  149. }
  150. itr, ok := tfr.iterators[i].(segment.OptimizablePostingsIterator)
  151. if !ok {
  152. // We only optimize postings iterators that support this operation.
  153. return nil, nil
  154. }
  155. // If the postings iterator is "1-hit" optimized, then we
  156. // can perform several optimizations up-front here.
  157. docNum1Hit, ok := itr.DocNum1Hit()
  158. if ok {
  159. if docNum1HitLastOk && docNum1HitLast != docNum1Hit {
  160. // The docNum1Hit doesn't match the previous
  161. // docNum1HitLast, so the entire AND is empty.
  162. oTFR.iterators[i] = segment.AnEmptyPostingsIterator
  163. continue OUTER
  164. }
  165. docNum1HitLast = docNum1Hit
  166. docNum1HitLastOk = true
  167. continue
  168. }
  169. if itr.ActualBitmap() == nil {
  170. // An empty actual bitmap means the entire AND is empty.
  171. oTFR.iterators[i] = segment.AnEmptyPostingsIterator
  172. continue OUTER
  173. }
  174. // Collect the actual bitmap for more processing later.
  175. actualBMs = append(actualBMs, itr.ActualBitmap())
  176. }
  177. if docNum1HitLastOk {
  178. // We reach here if all the 1-hit optimized posting
  179. // iterators had the same 1-hit docNum, so we can check if
  180. // our collected actual bitmaps also have that docNum.
  181. for _, bm := range actualBMs {
  182. if !bm.Contains(uint32(docNum1HitLast)) {
  183. // The docNum1Hit isn't in one of our actual
  184. // bitmaps, so the entire AND is empty.
  185. oTFR.iterators[i] = segment.AnEmptyPostingsIterator
  186. continue OUTER
  187. }
  188. }
  189. // The actual bitmaps and docNum1Hits all contain or have
  190. // the same 1-hit docNum, so that's our AND'ed result.
  191. oTFR.iterators[i] = segment.NewUnadornedPostingsIteratorFrom1Hit(docNum1HitLast)
  192. continue OUTER
  193. }
  194. if len(actualBMs) == 0 {
  195. // If we've collected no actual bitmaps at this point,
  196. // then the entire AND is empty.
  197. oTFR.iterators[i] = segment.AnEmptyPostingsIterator
  198. continue OUTER
  199. }
  200. if len(actualBMs) == 1 {
  201. // If we've only 1 actual bitmap, then that's our result.
  202. oTFR.iterators[i] = segment.NewUnadornedPostingsIteratorFromBitmap(actualBMs[0])
  203. continue OUTER
  204. }
  205. // Else, AND together our collected bitmaps as our result.
  206. bm := roaring.And(actualBMs[0], actualBMs[1])
  207. for _, actualBM := range actualBMs[2:] {
  208. bm.And(actualBM)
  209. }
  210. oTFR.iterators[i] = segment.NewUnadornedPostingsIteratorFromBitmap(bm)
  211. }
  212. return oTFR, nil
  213. }
  214. // ----------------------------------------------------------------
  215. // An "unadorned" disjunction optimization is appropriate when
  216. // additional or subsidiary information like freq-norm's and
  217. // term-vectors are not required, and instead only the internal-id's
  218. // are needed.
  219. func (s *IndexSnapshotTermFieldReader) optimizeDisjunctionUnadorned(
  220. octx index.OptimizableContext) (index.OptimizableContext, error) {
  221. if octx == nil {
  222. octx = &OptimizeTFRDisjunctionUnadorned{snapshot: s.snapshot}
  223. }
  224. o, ok := octx.(*OptimizeTFRDisjunctionUnadorned)
  225. if !ok {
  226. return nil, nil
  227. }
  228. if o.snapshot != s.snapshot {
  229. return nil, fmt.Errorf("tried to optimize unadorned disjunction across different snapshots")
  230. }
  231. o.tfrs = append(o.tfrs, s)
  232. return o, nil
  233. }
  234. type OptimizeTFRDisjunctionUnadorned struct {
  235. snapshot *IndexSnapshot
  236. tfrs []*IndexSnapshotTermFieldReader
  237. }
  238. var OptimizeTFRDisjunctionUnadornedTerm = []byte("<disjunction:unadorned>")
  239. var OptimizeTFRDisjunctionUnadornedField = "*"
  240. // Finish of an unadorned disjunction optimization will compute a
  241. // termFieldReader with an "actual" bitmap that represents the
  242. // constituent bitmaps OR'ed together. This termFieldReader cannot
  243. // provide any freq-norm or termVector associated information.
  244. func (o *OptimizeTFRDisjunctionUnadorned) Finish() (rv index.Optimized, err error) {
  245. if len(o.tfrs) <= 1 {
  246. return nil, nil
  247. }
  248. for i := range o.snapshot.segment {
  249. var cMax uint64
  250. for _, tfr := range o.tfrs {
  251. itr, ok := tfr.iterators[i].(segment.OptimizablePostingsIterator)
  252. if !ok {
  253. return nil, nil
  254. }
  255. if itr.ActualBitmap() != nil {
  256. c := itr.ActualBitmap().GetCardinality()
  257. if cMax < c {
  258. cMax = c
  259. }
  260. }
  261. }
  262. // Heuristic to skip the optimization if all the constituent
  263. // bitmaps are too small, where the processing & resource
  264. // overhead to create the OR'ed bitmap outweighs the benefit.
  265. if cMax < OptimizeDisjunctionUnadornedMinChildCardinality {
  266. return nil, nil
  267. }
  268. }
  269. // We use an artificial term and field because the optimized
  270. // termFieldReader can represent multiple terms and fields.
  271. oTFR := &IndexSnapshotTermFieldReader{
  272. term: OptimizeTFRDisjunctionUnadornedTerm,
  273. field: OptimizeTFRDisjunctionUnadornedField,
  274. snapshot: o.snapshot,
  275. iterators: make([]segment.PostingsIterator, len(o.snapshot.segment)),
  276. segmentOffset: 0,
  277. includeFreq: false,
  278. includeNorm: false,
  279. includeTermVectors: false,
  280. }
  281. var docNums []uint32 // Collected docNum's from 1-hit posting lists.
  282. var actualBMs []*roaring.Bitmap // Collected from regular posting lists.
  283. for i := range o.snapshot.segment {
  284. docNums = docNums[:0]
  285. actualBMs = actualBMs[:0]
  286. for _, tfr := range o.tfrs {
  287. itr, ok := tfr.iterators[i].(segment.OptimizablePostingsIterator)
  288. if !ok {
  289. return nil, nil
  290. }
  291. docNum, ok := itr.DocNum1Hit()
  292. if ok {
  293. docNums = append(docNums, uint32(docNum))
  294. continue
  295. }
  296. if itr.ActualBitmap() != nil {
  297. actualBMs = append(actualBMs, itr.ActualBitmap())
  298. }
  299. }
  300. var bm *roaring.Bitmap
  301. if len(actualBMs) > 2 {
  302. bm = roaring.HeapOr(actualBMs...)
  303. } else if len(actualBMs) == 2 {
  304. bm = roaring.Or(actualBMs[0], actualBMs[1])
  305. } else if len(actualBMs) == 1 {
  306. bm = actualBMs[0].Clone()
  307. }
  308. if bm == nil {
  309. bm = roaring.New()
  310. }
  311. bm.AddMany(docNums)
  312. oTFR.iterators[i] = segment.NewUnadornedPostingsIteratorFromBitmap(bm)
  313. }
  314. return oTFR, nil
  315. }