You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

builder.go 8.1KB


  1. // Copyright (c) 2019 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package scorch
  15. import (
  16. "fmt"
  17. "io/ioutil"
  18. "os"
  19. "sync"
  20. "github.com/RoaringBitmap/roaring"
  21. index "github.com/blevesearch/bleve_index_api"
  22. segment "github.com/blevesearch/scorch_segment_api/v2"
  23. bolt "go.etcd.io/bbolt"
  24. )
  25. const DefaultBuilderBatchSize = 1000
  26. const DefaultBuilderMergeMax = 10
  27. type Builder struct {
  28. m sync.Mutex
  29. segCount uint64
  30. path string
  31. buildPath string
  32. segPaths []string
  33. batchSize int
  34. mergeMax int
  35. batch *index.Batch
  36. internal map[string][]byte
  37. segPlugin SegmentPlugin
  38. }
  39. func NewBuilder(config map[string]interface{}) (*Builder, error) {
  40. path, ok := config["path"].(string)
  41. if !ok {
  42. return nil, fmt.Errorf("must specify path")
  43. }
  44. buildPathPrefix, _ := config["buildPathPrefix"].(string)
  45. buildPath, err := ioutil.TempDir(buildPathPrefix, "scorch-offline-build")
  46. if err != nil {
  47. return nil, err
  48. }
  49. rv := &Builder{
  50. path: path,
  51. buildPath: buildPath,
  52. mergeMax: DefaultBuilderMergeMax,
  53. batchSize: DefaultBuilderBatchSize,
  54. batch: index.NewBatch(),
  55. segPlugin: defaultSegmentPlugin,
  56. }
  57. err = rv.parseConfig(config)
  58. if err != nil {
  59. return nil, fmt.Errorf("error parsing builder config: %v", err)
  60. }
  61. return rv, nil
  62. }
  63. func (o *Builder) parseConfig(config map[string]interface{}) (err error) {
  64. if v, ok := config["mergeMax"]; ok {
  65. var t int
  66. if t, err = parseToInteger(v); err != nil {
  67. return fmt.Errorf("mergeMax parse err: %v", err)
  68. }
  69. if t > 0 {
  70. o.mergeMax = t
  71. }
  72. }
  73. if v, ok := config["batchSize"]; ok {
  74. var t int
  75. if t, err = parseToInteger(v); err != nil {
  76. return fmt.Errorf("batchSize parse err: %v", err)
  77. }
  78. if t > 0 {
  79. o.batchSize = t
  80. }
  81. }
  82. if v, ok := config["internal"]; ok {
  83. if vinternal, ok := v.(map[string][]byte); ok {
  84. o.internal = vinternal
  85. }
  86. }
  87. forcedSegmentType, forcedSegmentVersion, err := configForceSegmentTypeVersion(config)
  88. if err != nil {
  89. return err
  90. }
  91. if forcedSegmentType != "" && forcedSegmentVersion != 0 {
  92. segPlugin, err := chooseSegmentPlugin(forcedSegmentType,
  93. uint32(forcedSegmentVersion))
  94. if err != nil {
  95. return err
  96. }
  97. o.segPlugin = segPlugin
  98. }
  99. return nil
  100. }
  101. // Index will place the document into the index.
  102. // It is invalid to index the same document multiple times.
  103. func (o *Builder) Index(doc index.Document) error {
  104. o.m.Lock()
  105. defer o.m.Unlock()
  106. o.batch.Update(doc)
  107. return o.maybeFlushBatchLOCKED(o.batchSize)
  108. }
  109. func (o *Builder) maybeFlushBatchLOCKED(moreThan int) error {
  110. if len(o.batch.IndexOps) >= moreThan {
  111. defer o.batch.Reset()
  112. return o.executeBatchLOCKED(o.batch)
  113. }
  114. return nil
  115. }
  116. func (o *Builder) executeBatchLOCKED(batch *index.Batch) (err error) {
  117. analysisResults := make([]index.Document, 0, len(batch.IndexOps))
  118. for _, doc := range batch.IndexOps {
  119. if doc != nil {
  120. // insert _id field
  121. doc.AddIDField()
  122. // perform analysis directly
  123. analyze(doc)
  124. analysisResults = append(analysisResults, doc)
  125. }
  126. }
  127. seg, _, err := o.segPlugin.New(analysisResults)
  128. if err != nil {
  129. return fmt.Errorf("error building segment base: %v", err)
  130. }
  131. filename := zapFileName(o.segCount)
  132. o.segCount++
  133. path := o.buildPath + string(os.PathSeparator) + filename
  134. if segUnpersisted, ok := seg.(segment.UnpersistedSegment); ok {
  135. err = segUnpersisted.Persist(path)
  136. if err != nil {
  137. return fmt.Errorf("error persisting segment base to %s: %v", path, err)
  138. }
  139. o.segPaths = append(o.segPaths, path)
  140. return nil
  141. }
  142. return fmt.Errorf("new segment does not implement unpersisted: %T", seg)
  143. }
  144. func (o *Builder) doMerge() error {
  145. // as long as we have more than 1 segment, keep merging
  146. for len(o.segPaths) > 1 {
  147. // merge the next <mergeMax> number of segments into one new one
  148. // or, if there are fewer than <mergeMax> remaining, merge them all
  149. mergeCount := o.mergeMax
  150. if mergeCount > len(o.segPaths) {
  151. mergeCount = len(o.segPaths)
  152. }
  153. mergePaths := o.segPaths[0:mergeCount]
  154. o.segPaths = o.segPaths[mergeCount:]
  155. // open each of the segments to be merged
  156. mergeSegs := make([]segment.Segment, 0, mergeCount)
  157. // closeOpenedSegs attempts to close all opened
  158. // segments even if an error occurs, in which case
  159. // the first error is returned
  160. closeOpenedSegs := func() error {
  161. var err error
  162. for _, seg := range mergeSegs {
  163. clErr := seg.Close()
  164. if clErr != nil && err == nil {
  165. err = clErr
  166. }
  167. }
  168. return err
  169. }
  170. for _, mergePath := range mergePaths {
  171. seg, err := o.segPlugin.Open(mergePath)
  172. if err != nil {
  173. _ = closeOpenedSegs()
  174. return fmt.Errorf("error opening segment (%s) for merge: %v", mergePath, err)
  175. }
  176. mergeSegs = append(mergeSegs, seg)
  177. }
  178. // do the merge
  179. mergedSegPath := o.buildPath + string(os.PathSeparator) + zapFileName(o.segCount)
  180. drops := make([]*roaring.Bitmap, mergeCount)
  181. _, _, err := o.segPlugin.Merge(mergeSegs, drops, mergedSegPath, nil, nil)
  182. if err != nil {
  183. _ = closeOpenedSegs()
  184. return fmt.Errorf("error merging segments (%v): %v", mergePaths, err)
  185. }
  186. o.segCount++
  187. o.segPaths = append(o.segPaths, mergedSegPath)
  188. // close segments opened for merge
  189. err = closeOpenedSegs()
  190. if err != nil {
  191. return fmt.Errorf("error closing opened segments: %v", err)
  192. }
  193. // remove merged segments
  194. for _, mergePath := range mergePaths {
  195. err = os.RemoveAll(mergePath)
  196. if err != nil {
  197. return fmt.Errorf("error removing segment %s after merge: %v", mergePath, err)
  198. }
  199. }
  200. }
  201. return nil
  202. }
  203. func (o *Builder) Close() error {
  204. o.m.Lock()
  205. defer o.m.Unlock()
  206. // see if there is a partial batch
  207. err := o.maybeFlushBatchLOCKED(1)
  208. if err != nil {
  209. return fmt.Errorf("error flushing batch before close: %v", err)
  210. }
  211. // perform all the merging
  212. err = o.doMerge()
  213. if err != nil {
  214. return fmt.Errorf("error while merging: %v", err)
  215. }
  216. // ensure the store path exists
  217. err = os.MkdirAll(o.path, 0700)
  218. if err != nil {
  219. return err
  220. }
  221. // move final segment into place
  222. // segment id 2 is chosen to match the behavior of a scorch
  223. // index which indexes a single batch of data
  224. finalSegPath := o.path + string(os.PathSeparator) + zapFileName(2)
  225. err = os.Rename(o.segPaths[0], finalSegPath)
  226. if err != nil {
  227. return fmt.Errorf("error moving final segment into place: %v", err)
  228. }
  229. // remove the buildPath, as it is no longer needed
  230. err = os.RemoveAll(o.buildPath)
  231. if err != nil {
  232. return fmt.Errorf("error removing build path: %v", err)
  233. }
  234. // prepare wrapping
  235. seg, err := o.segPlugin.Open(finalSegPath)
  236. if err != nil {
  237. return fmt.Errorf("error opening final segment")
  238. }
  239. // create a segment snapshot for this segment
  240. ss := &SegmentSnapshot{
  241. segment: seg,
  242. }
  243. is := &IndexSnapshot{
  244. epoch: 3, // chosen to match scorch behavior when indexing a single batch
  245. segment: []*SegmentSnapshot{ss},
  246. creator: "scorch-builder",
  247. internal: o.internal,
  248. }
  249. // create the root bolt
  250. rootBoltPath := o.path + string(os.PathSeparator) + "root.bolt"
  251. rootBolt, err := bolt.Open(rootBoltPath, 0600, nil)
  252. if err != nil {
  253. return err
  254. }
  255. // start a write transaction
  256. tx, err := rootBolt.Begin(true)
  257. if err != nil {
  258. return err
  259. }
  260. // fill the root bolt with this fake index snapshot
  261. _, _, err = prepareBoltSnapshot(is, tx, o.path, o.segPlugin)
  262. if err != nil {
  263. _ = tx.Rollback()
  264. _ = rootBolt.Close()
  265. return fmt.Errorf("error preparing bolt snapshot in root.bolt: %v", err)
  266. }
  267. // commit bolt data
  268. err = tx.Commit()
  269. if err != nil {
  270. _ = rootBolt.Close()
  271. return fmt.Errorf("error committing bolt tx in root.bolt: %v", err)
  272. }
  273. // close bolt
  274. err = rootBolt.Close()
  275. if err != nil {
  276. return fmt.Errorf("error closing root.bolt: %v", err)
  277. }
  278. // close final segment
  279. err = seg.Close()
  280. if err != nil {
  281. return fmt.Errorf("error closing final segment: %v", err)
  282. }
  283. return nil
  284. }