You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

topn.go 11KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412
  1. // Copyright (c) 2014 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package collector
  15. import (
  16. "context"
  17. "reflect"
  18. "strconv"
  19. "time"
  20. "github.com/blevesearch/bleve/index"
  21. "github.com/blevesearch/bleve/search"
  22. "github.com/blevesearch/bleve/size"
  23. )
  24. var reflectStaticSizeTopNCollector int
  25. func init() {
  26. var coll TopNCollector
  27. reflectStaticSizeTopNCollector = int(reflect.TypeOf(coll).Size())
  28. }
  29. type collectorStore interface {
  30. // Add the document, and if the new store size exceeds the provided size
  31. // the last element is removed and returned. If the size has not been
  32. // exceeded, nil is returned.
  33. AddNotExceedingSize(doc *search.DocumentMatch, size int) *search.DocumentMatch
  34. Final(skip int, fixup collectorFixup) (search.DocumentMatchCollection, error)
  35. }
  36. // PreAllocSizeSkipCap will cap preallocation to this amount when
  37. // size+skip exceeds this value
  38. var PreAllocSizeSkipCap = 1000
  39. type collectorCompare func(i, j *search.DocumentMatch) int
  40. type collectorFixup func(d *search.DocumentMatch) error
  41. // TopNCollector collects the top N hits, optionally skipping some results
  42. type TopNCollector struct {
  43. size int
  44. skip int
  45. total uint64
  46. maxScore float64
  47. took time.Duration
  48. sort search.SortOrder
  49. results search.DocumentMatchCollection
  50. facetsBuilder *search.FacetsBuilder
  51. store collectorStore
  52. needDocIds bool
  53. neededFields []string
  54. cachedScoring []bool
  55. cachedDesc []bool
  56. lowestMatchOutsideResults *search.DocumentMatch
  57. updateFieldVisitor index.DocumentFieldTermVisitor
  58. dvReader index.DocValueReader
  59. searchAfter *search.DocumentMatch
  60. }
  61. // CheckDoneEvery controls how frequently we check the context deadline
  62. const CheckDoneEvery = uint64(1024)
  63. // NewTopNCollector builds a collector to find the top 'size' hits
  64. // skipping over the first 'skip' hits
  65. // ordering hits by the provided sort order
  66. func NewTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector {
  67. return newTopNCollector(size, skip, sort)
  68. }
  69. // NewTopNCollector builds a collector to find the top 'size' hits
  70. // skipping over the first 'skip' hits
  71. // ordering hits by the provided sort order
  72. func NewTopNCollectorAfter(size int, sort search.SortOrder, after []string) *TopNCollector {
  73. rv := newTopNCollector(size, 0, sort)
  74. rv.searchAfter = &search.DocumentMatch{
  75. Sort: after,
  76. }
  77. for pos, ss := range sort {
  78. if ss.RequiresDocID() {
  79. rv.searchAfter.ID = after[pos]
  80. }
  81. if ss.RequiresScoring() {
  82. if score, err := strconv.ParseFloat(after[pos], 64); err == nil {
  83. rv.searchAfter.Score = score
  84. }
  85. }
  86. }
  87. return rv
  88. }
  89. func newTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector {
  90. hc := &TopNCollector{size: size, skip: skip, sort: sort}
  91. // pre-allocate space on the store to avoid reslicing
  92. // unless the size + skip is too large, then cap it
  93. // everything should still work, just reslices as necessary
  94. backingSize := size + skip + 1
  95. if size+skip > PreAllocSizeSkipCap {
  96. backingSize = PreAllocSizeSkipCap + 1
  97. }
  98. if size+skip > 10 {
  99. hc.store = newStoreHeap(backingSize, func(i, j *search.DocumentMatch) int {
  100. return hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, i, j)
  101. })
  102. } else {
  103. hc.store = newStoreSlice(backingSize, func(i, j *search.DocumentMatch) int {
  104. return hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, i, j)
  105. })
  106. }
  107. // these lookups traverse an interface, so do once up-front
  108. if sort.RequiresDocID() {
  109. hc.needDocIds = true
  110. }
  111. hc.neededFields = sort.RequiredFields()
  112. hc.cachedScoring = sort.CacheIsScore()
  113. hc.cachedDesc = sort.CacheDescending()
  114. return hc
  115. }
  116. func (hc *TopNCollector) Size() int {
  117. sizeInBytes := reflectStaticSizeTopNCollector + size.SizeOfPtr
  118. if hc.facetsBuilder != nil {
  119. sizeInBytes += hc.facetsBuilder.Size()
  120. }
  121. for _, entry := range hc.neededFields {
  122. sizeInBytes += len(entry) + size.SizeOfString
  123. }
  124. sizeInBytes += len(hc.cachedScoring) + len(hc.cachedDesc)
  125. return sizeInBytes
  126. }
  127. // Collect goes to the index to find the matching documents
  128. func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, reader index.IndexReader) error {
  129. startTime := time.Now()
  130. var err error
  131. var next *search.DocumentMatch
  132. // pre-allocate enough space in the DocumentMatchPool
  133. // unless the size + skip is too large, then cap it
  134. // everything should still work, just allocates DocumentMatches on demand
  135. backingSize := hc.size + hc.skip + 1
  136. if hc.size+hc.skip > PreAllocSizeSkipCap {
  137. backingSize = PreAllocSizeSkipCap + 1
  138. }
  139. searchContext := &search.SearchContext{
  140. DocumentMatchPool: search.NewDocumentMatchPool(backingSize+searcher.DocumentMatchPoolSize(), len(hc.sort)),
  141. Collector: hc,
  142. IndexReader: reader,
  143. }
  144. hc.dvReader, err = reader.DocValueReader(hc.neededFields)
  145. if err != nil {
  146. return err
  147. }
  148. hc.updateFieldVisitor = func(field string, term []byte) {
  149. if hc.facetsBuilder != nil {
  150. hc.facetsBuilder.UpdateVisitor(field, term)
  151. }
  152. hc.sort.UpdateVisitor(field, term)
  153. }
  154. dmHandlerMaker := MakeTopNDocumentMatchHandler
  155. if cv := ctx.Value(search.MakeDocumentMatchHandlerKey); cv != nil {
  156. dmHandlerMaker = cv.(search.MakeDocumentMatchHandler)
  157. }
  158. // use the application given builder for making the custom document match
  159. // handler and perform callbacks/invocations on the newly made handler.
  160. dmHandler, loadID, err := dmHandlerMaker(searchContext)
  161. if err != nil {
  162. return err
  163. }
  164. hc.needDocIds = hc.needDocIds || loadID
  165. select {
  166. case <-ctx.Done():
  167. return ctx.Err()
  168. default:
  169. next, err = searcher.Next(searchContext)
  170. }
  171. for err == nil && next != nil {
  172. if hc.total%CheckDoneEvery == 0 {
  173. select {
  174. case <-ctx.Done():
  175. return ctx.Err()
  176. default:
  177. }
  178. }
  179. err = hc.prepareDocumentMatch(searchContext, reader, next)
  180. if err != nil {
  181. break
  182. }
  183. err = dmHandler(next)
  184. if err != nil {
  185. break
  186. }
  187. next, err = searcher.Next(searchContext)
  188. }
  189. // help finalize/flush the results in case
  190. // of custom document match handlers.
  191. err = dmHandler(nil)
  192. if err != nil {
  193. return err
  194. }
  195. // compute search duration
  196. hc.took = time.Since(startTime)
  197. if err != nil {
  198. return err
  199. }
  200. // finalize actual results
  201. err = hc.finalizeResults(reader)
  202. if err != nil {
  203. return err
  204. }
  205. return nil
  206. }
  207. var sortByScoreOpt = []string{"_score"}
  208. func (hc *TopNCollector) prepareDocumentMatch(ctx *search.SearchContext,
  209. reader index.IndexReader, d *search.DocumentMatch) (err error) {
  210. // visit field terms for features that require it (sort, facets)
  211. if len(hc.neededFields) > 0 {
  212. err = hc.visitFieldTerms(reader, d)
  213. if err != nil {
  214. return err
  215. }
  216. }
  217. // increment total hits
  218. hc.total++
  219. d.HitNumber = hc.total
  220. // update max score
  221. if d.Score > hc.maxScore {
  222. hc.maxScore = d.Score
  223. }
  224. // see if we need to load ID (at this early stage, for example to sort on it)
  225. if hc.needDocIds {
  226. d.ID, err = reader.ExternalID(d.IndexInternalID)
  227. if err != nil {
  228. return err
  229. }
  230. }
  231. // compute this hits sort value
  232. if len(hc.sort) == 1 && hc.cachedScoring[0] {
  233. d.Sort = sortByScoreOpt
  234. } else {
  235. hc.sort.Value(d)
  236. }
  237. return nil
  238. }
  239. func MakeTopNDocumentMatchHandler(
  240. ctx *search.SearchContext) (search.DocumentMatchHandler, bool, error) {
  241. var hc *TopNCollector
  242. var ok bool
  243. if hc, ok = ctx.Collector.(*TopNCollector); ok {
  244. return func(d *search.DocumentMatch) error {
  245. if d == nil {
  246. return nil
  247. }
  248. // support search after based pagination,
  249. // if this hit is <= the search after sort key
  250. // we should skip it
  251. if hc.searchAfter != nil {
  252. // exact sort order matches use hit number to break tie
  253. // but we want to allow for exact match, so we pretend
  254. hc.searchAfter.HitNumber = d.HitNumber
  255. if hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, d, hc.searchAfter) <= 0 {
  256. return nil
  257. }
  258. }
  259. // optimization, we track lowest sorting hit already removed from heap
  260. // with this one comparison, we can avoid all heap operations if
  261. // this hit would have been added and then immediately removed
  262. if hc.lowestMatchOutsideResults != nil {
  263. cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, d,
  264. hc.lowestMatchOutsideResults)
  265. if cmp >= 0 {
  266. // this hit can't possibly be in the result set, so avoid heap ops
  267. ctx.DocumentMatchPool.Put(d)
  268. return nil
  269. }
  270. }
  271. removed := hc.store.AddNotExceedingSize(d, hc.size+hc.skip)
  272. if removed != nil {
  273. if hc.lowestMatchOutsideResults == nil {
  274. hc.lowestMatchOutsideResults = removed
  275. } else {
  276. cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc,
  277. removed, hc.lowestMatchOutsideResults)
  278. if cmp < 0 {
  279. tmp := hc.lowestMatchOutsideResults
  280. hc.lowestMatchOutsideResults = removed
  281. ctx.DocumentMatchPool.Put(tmp)
  282. }
  283. }
  284. }
  285. return nil
  286. }, false, nil
  287. }
  288. return nil, false, nil
  289. }
  290. // visitFieldTerms is responsible for visiting the field terms of the
  291. // search hit, and passing visited terms to the sort and facet builder
  292. func (hc *TopNCollector) visitFieldTerms(reader index.IndexReader, d *search.DocumentMatch) error {
  293. if hc.facetsBuilder != nil {
  294. hc.facetsBuilder.StartDoc()
  295. }
  296. err := hc.dvReader.VisitDocValues(d.IndexInternalID, hc.updateFieldVisitor)
  297. if hc.facetsBuilder != nil {
  298. hc.facetsBuilder.EndDoc()
  299. }
  300. return err
  301. }
  302. // SetFacetsBuilder registers a facet builder for this collector
  303. func (hc *TopNCollector) SetFacetsBuilder(facetsBuilder *search.FacetsBuilder) {
  304. hc.facetsBuilder = facetsBuilder
  305. hc.neededFields = append(hc.neededFields, hc.facetsBuilder.RequiredFields()...)
  306. }
  307. // finalizeResults starts with the heap containing the final top size+skip
  308. // it now throws away the results to be skipped
  309. // and does final doc id lookup (if necessary)
  310. func (hc *TopNCollector) finalizeResults(r index.IndexReader) error {
  311. var err error
  312. hc.results, err = hc.store.Final(hc.skip, func(doc *search.DocumentMatch) error {
  313. if doc.ID == "" {
  314. // look up the id since we need it for lookup
  315. var err error
  316. doc.ID, err = r.ExternalID(doc.IndexInternalID)
  317. if err != nil {
  318. return err
  319. }
  320. }
  321. doc.Complete(nil)
  322. return nil
  323. })
  324. return err
  325. }
  326. // Results returns the collected hits
  327. func (hc *TopNCollector) Results() search.DocumentMatchCollection {
  328. return hc.results
  329. }
  330. // Total returns the total number of hits
  331. func (hc *TopNCollector) Total() uint64 {
  332. return hc.total
  333. }
  334. // MaxScore returns the maximum score seen across all the hits
  335. func (hc *TopNCollector) MaxScore() float64 {
  336. return hc.maxScore
  337. }
  338. // Took returns the time spent collecting hits
  339. func (hc *TopNCollector) Took() time.Duration {
  340. return hc.took
  341. }
  342. // FacetResults returns the computed facets results
  343. func (hc *TopNCollector) FacetResults() search.FacetResults {
  344. if hc.facetsBuilder != nil {
  345. return hc.facetsBuilder.Results()
  346. }
  347. return nil
  348. }