Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

topn.go 8.1KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292
  1. // Copyright (c) 2014 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package collector
  15. import (
  16. "context"
  17. "time"
  18. "github.com/blevesearch/bleve/index"
  19. "github.com/blevesearch/bleve/search"
  20. )
// collectorStore is the container in which the collector keeps its
// candidate hits while a search is being collected.
type collectorStore interface {
	// Add the document, and if the new store size exceeds the provided size
	// the last element is removed and returned. If the size has not been
	// exceeded, nil is returned.
	AddNotExceedingSize(doc *search.DocumentMatch, size int) *search.DocumentMatch

	// Final is called once collection has finished. It receives the number
	// of leading hits to skip and a fixup callback, and yields the hits to
	// report or an error.
	// NOTE(review): how skip and fixup are applied is decided by the
	// implementations, which are not visible here — confirm there.
	Final(skip int, fixup collectorFixup) (search.DocumentMatchCollection, error)
}
// PreAllocSizeSkipCap is the upper bound on preallocation: when size+skip
// exceeds this value, the store and the document match pool are sized for
// PreAllocSizeSkipCap+1 entries instead of size+skip+1.
var PreAllocSizeSkipCap = 1000
// collectorCompare orders two hits; the stores are constructed with one so
// they can rank the hits they hold. In this file it always wraps
// search.SortOrder.Compare.
type collectorCompare func(i, j *search.DocumentMatch) int

// collectorFixup is a per-hit callback handed to collectorStore.Final; a
// non-nil error reports that the hit could not be fixed up.
type collectorFixup func(d *search.DocumentMatch) error
// TopNCollector collects the top N hits, optionally skipping some results
type TopNCollector struct {
	// size is the number of hits to return, skip the number of leading
	// hits to discard; the store is limited to size+skip hits.
	size int
	skip int
	// total counts every hit passed to collectSingle.
	total uint64
	// maxScore is the highest Score seen on any hit.
	maxScore float64
	// took is the wall-clock time measured by Collect.
	took time.Duration
	// sort is the ordering applied to hits.
	sort search.SortOrder
	// results holds the final hits, filled in by finalizeResults.
	results search.DocumentMatchCollection
	// facetsBuilder is optional; nil means no facets are computed.
	facetsBuilder *search.FacetsBuilder
	// store holds the current candidate hits.
	store collectorStore

	// needDocIds is true when the sort order requires the external
	// document ID, so it must be loaded for every hit during collection.
	needDocIds bool
	// neededFields lists the fields whose terms are visited for each hit
	// (required by the sort order and, if set, the facets builder).
	neededFields []string
	// cachedScoring and cachedDesc are looked up once from the sort order
	// (CacheIsScore / CacheDescending) and passed to every Compare call.
	cachedScoring []bool
	cachedDesc    []bool

	// lowestMatchOutsideResults is the lowest sorting hit that has already
	// been removed from the store; it lets collectSingle reject a hit with a
	// single comparison.
	lowestMatchOutsideResults *search.DocumentMatch
}
// CheckDoneEvery controls how frequently Collect checks whether the context
// is done: the check runs whenever the number of hits collected so far is a
// multiple of this value.
const CheckDoneEvery = uint64(1024)
  52. // NewTopNCollector builds a collector to find the top 'size' hits
  53. // skipping over the first 'skip' hits
  54. // ordering hits by the provided sort order
  55. func NewTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector {
  56. hc := &TopNCollector{size: size, skip: skip, sort: sort}
  57. // pre-allocate space on the store to avoid reslicing
  58. // unless the size + skip is too large, then cap it
  59. // everything should still work, just reslices as necessary
  60. backingSize := size + skip + 1
  61. if size+skip > PreAllocSizeSkipCap {
  62. backingSize = PreAllocSizeSkipCap + 1
  63. }
  64. if size+skip > 10 {
  65. hc.store = newStoreHeap(backingSize, func(i, j *search.DocumentMatch) int {
  66. return hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, i, j)
  67. })
  68. } else {
  69. hc.store = newStoreSlice(backingSize, func(i, j *search.DocumentMatch) int {
  70. return hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, i, j)
  71. })
  72. }
  73. // these lookups traverse an interface, so do once up-front
  74. if sort.RequiresDocID() {
  75. hc.needDocIds = true
  76. }
  77. hc.neededFields = sort.RequiredFields()
  78. hc.cachedScoring = sort.CacheIsScore()
  79. hc.cachedDesc = sort.CacheDescending()
  80. return hc
  81. }
  82. // Collect goes to the index to find the matching documents
  83. func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, reader index.IndexReader) error {
  84. startTime := time.Now()
  85. var err error
  86. var next *search.DocumentMatch
  87. // pre-allocate enough space in the DocumentMatchPool
  88. // unless the size + skip is too large, then cap it
  89. // everything should still work, just allocates DocumentMatches on demand
  90. backingSize := hc.size + hc.skip + 1
  91. if hc.size+hc.skip > PreAllocSizeSkipCap {
  92. backingSize = PreAllocSizeSkipCap + 1
  93. }
  94. searchContext := &search.SearchContext{
  95. DocumentMatchPool: search.NewDocumentMatchPool(backingSize+searcher.DocumentMatchPoolSize(), len(hc.sort)),
  96. }
  97. select {
  98. case <-ctx.Done():
  99. return ctx.Err()
  100. default:
  101. next, err = searcher.Next(searchContext)
  102. }
  103. for err == nil && next != nil {
  104. if hc.total%CheckDoneEvery == 0 {
  105. select {
  106. case <-ctx.Done():
  107. return ctx.Err()
  108. default:
  109. }
  110. }
  111. err = hc.collectSingle(searchContext, reader, next)
  112. if err != nil {
  113. break
  114. }
  115. next, err = searcher.Next(searchContext)
  116. }
  117. // compute search duration
  118. hc.took = time.Since(startTime)
  119. if err != nil {
  120. return err
  121. }
  122. // finalize actual results
  123. err = hc.finalizeResults(reader)
  124. if err != nil {
  125. return err
  126. }
  127. return nil
  128. }
// sortByScoreOpt is the sort value assigned to a hit when the sort order is
// a single score-based entry. Every such hit is given this same slice rather
// than a freshly computed one.
var sortByScoreOpt = []string{"_score"}
  130. func (hc *TopNCollector) collectSingle(ctx *search.SearchContext, reader index.IndexReader, d *search.DocumentMatch) error {
  131. var err error
  132. // visit field terms for features that require it (sort, facets)
  133. if len(hc.neededFields) > 0 {
  134. err = hc.visitFieldTerms(reader, d)
  135. if err != nil {
  136. return err
  137. }
  138. }
  139. // increment total hits
  140. hc.total++
  141. d.HitNumber = hc.total
  142. // update max score
  143. if d.Score > hc.maxScore {
  144. hc.maxScore = d.Score
  145. }
  146. // see if we need to load ID (at this early stage, for example to sort on it)
  147. if hc.needDocIds {
  148. d.ID, err = reader.ExternalID(d.IndexInternalID)
  149. if err != nil {
  150. return err
  151. }
  152. }
  153. // compute this hits sort value
  154. if len(hc.sort) == 1 && hc.cachedScoring[0] {
  155. d.Sort = sortByScoreOpt
  156. } else {
  157. hc.sort.Value(d)
  158. }
  159. // optimization, we track lowest sorting hit already removed from heap
  160. // with this one comparison, we can avoid all heap operations if
  161. // this hit would have been added and then immediately removed
  162. if hc.lowestMatchOutsideResults != nil {
  163. cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, d, hc.lowestMatchOutsideResults)
  164. if cmp >= 0 {
  165. // this hit can't possibly be in the result set, so avoid heap ops
  166. ctx.DocumentMatchPool.Put(d)
  167. return nil
  168. }
  169. }
  170. removed := hc.store.AddNotExceedingSize(d, hc.size+hc.skip)
  171. if removed != nil {
  172. if hc.lowestMatchOutsideResults == nil {
  173. hc.lowestMatchOutsideResults = removed
  174. } else {
  175. cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, removed, hc.lowestMatchOutsideResults)
  176. if cmp < 0 {
  177. tmp := hc.lowestMatchOutsideResults
  178. hc.lowestMatchOutsideResults = removed
  179. ctx.DocumentMatchPool.Put(tmp)
  180. }
  181. }
  182. }
  183. return nil
  184. }
  185. // visitFieldTerms is responsible for visiting the field terms of the
  186. // search hit, and passing visited terms to the sort and facet builder
  187. func (hc *TopNCollector) visitFieldTerms(reader index.IndexReader, d *search.DocumentMatch) error {
  188. if hc.facetsBuilder != nil {
  189. hc.facetsBuilder.StartDoc()
  190. }
  191. err := reader.DocumentVisitFieldTerms(d.IndexInternalID, hc.neededFields, func(field string, term []byte) {
  192. if hc.facetsBuilder != nil {
  193. hc.facetsBuilder.UpdateVisitor(field, term)
  194. }
  195. hc.sort.UpdateVisitor(field, term)
  196. })
  197. if hc.facetsBuilder != nil {
  198. hc.facetsBuilder.EndDoc()
  199. }
  200. return err
  201. }
  202. // SetFacetsBuilder registers a facet builder for this collector
  203. func (hc *TopNCollector) SetFacetsBuilder(facetsBuilder *search.FacetsBuilder) {
  204. hc.facetsBuilder = facetsBuilder
  205. hc.neededFields = append(hc.neededFields, hc.facetsBuilder.RequiredFields()...)
  206. }
  207. // finalizeResults starts with the heap containing the final top size+skip
  208. // it now throws away the results to be skipped
  209. // and does final doc id lookup (if necessary)
  210. func (hc *TopNCollector) finalizeResults(r index.IndexReader) error {
  211. var err error
  212. hc.results, err = hc.store.Final(hc.skip, func(doc *search.DocumentMatch) error {
  213. if doc.ID == "" {
  214. // look up the id since we need it for lookup
  215. var err error
  216. doc.ID, err = r.ExternalID(doc.IndexInternalID)
  217. if err != nil {
  218. return err
  219. }
  220. }
  221. return nil
  222. })
  223. return err
  224. }
// Results returns the collected hits. The value is populated when the
// results are finalized at the end of Collect; before that it is nil.
func (hc *TopNCollector) Results() search.DocumentMatchCollection {
	return hc.results
}
// Total returns the total number of hits seen by the collector, including
// hits that did not make it into the result window.
func (hc *TopNCollector) Total() uint64 {
	return hc.total
}
// MaxScore returns the maximum score seen across all the hits; it is zero
// if no hit scored above zero.
func (hc *TopNCollector) MaxScore() float64 {
	return hc.maxScore
}
// Took returns the time spent collecting hits, as measured by Collect. It is
// not set when Collect returns early because the context was done.
func (hc *TopNCollector) Took() time.Duration {
	return hc.took
}
  241. // FacetResults returns the computed facets results
  242. func (hc *TopNCollector) FacetResults() search.FacetResults {
  243. if hc.facetsBuilder != nil {
  244. return hc.facetsBuilder.Results()
  245. }
  246. return search.FacetResults{}
  247. }