123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333 |
- // Copyright (c) 2019 Couchbase, Inc.
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
-
- package scorch
-
- import (
- "fmt"
- "io/ioutil"
- "os"
- "sync"
-
- "github.com/RoaringBitmap/roaring"
- index "github.com/blevesearch/bleve_index_api"
- segment "github.com/blevesearch/scorch_segment_api/v2"
- bolt "go.etcd.io/bbolt"
- )
-
- const DefaultBuilderBatchSize = 1000
- const DefaultBuilderMergeMax = 10
-
- type Builder struct {
- m sync.Mutex
- segCount uint64
- path string
- buildPath string
- segPaths []string
- batchSize int
- mergeMax int
- batch *index.Batch
- internal map[string][]byte
- segPlugin SegmentPlugin
- }
-
- func NewBuilder(config map[string]interface{}) (*Builder, error) {
- path, ok := config["path"].(string)
- if !ok {
- return nil, fmt.Errorf("must specify path")
- }
-
- buildPathPrefix, _ := config["buildPathPrefix"].(string)
- buildPath, err := ioutil.TempDir(buildPathPrefix, "scorch-offline-build")
- if err != nil {
- return nil, err
- }
-
- rv := &Builder{
- path: path,
- buildPath: buildPath,
- mergeMax: DefaultBuilderMergeMax,
- batchSize: DefaultBuilderBatchSize,
- batch: index.NewBatch(),
- segPlugin: defaultSegmentPlugin,
- }
-
- err = rv.parseConfig(config)
- if err != nil {
- return nil, fmt.Errorf("error parsing builder config: %v", err)
- }
-
- return rv, nil
- }
-
- func (o *Builder) parseConfig(config map[string]interface{}) (err error) {
- if v, ok := config["mergeMax"]; ok {
- var t int
- if t, err = parseToInteger(v); err != nil {
- return fmt.Errorf("mergeMax parse err: %v", err)
- }
- if t > 0 {
- o.mergeMax = t
- }
- }
-
- if v, ok := config["batchSize"]; ok {
- var t int
- if t, err = parseToInteger(v); err != nil {
- return fmt.Errorf("batchSize parse err: %v", err)
- }
- if t > 0 {
- o.batchSize = t
- }
- }
-
- if v, ok := config["internal"]; ok {
- if vinternal, ok := v.(map[string][]byte); ok {
- o.internal = vinternal
- }
- }
-
- forcedSegmentType, forcedSegmentVersion, err := configForceSegmentTypeVersion(config)
- if err != nil {
- return err
- }
- if forcedSegmentType != "" && forcedSegmentVersion != 0 {
- segPlugin, err := chooseSegmentPlugin(forcedSegmentType,
- uint32(forcedSegmentVersion))
- if err != nil {
- return err
- }
- o.segPlugin = segPlugin
- }
-
- return nil
- }
-
- // Index will place the document into the index.
- // It is invalid to index the same document multiple times.
- func (o *Builder) Index(doc index.Document) error {
- o.m.Lock()
- defer o.m.Unlock()
-
- o.batch.Update(doc)
-
- return o.maybeFlushBatchLOCKED(o.batchSize)
- }
-
- func (o *Builder) maybeFlushBatchLOCKED(moreThan int) error {
- if len(o.batch.IndexOps) >= moreThan {
- defer o.batch.Reset()
- return o.executeBatchLOCKED(o.batch)
- }
- return nil
- }
-
- func (o *Builder) executeBatchLOCKED(batch *index.Batch) (err error) {
- analysisResults := make([]index.Document, 0, len(batch.IndexOps))
- for _, doc := range batch.IndexOps {
- if doc != nil {
- // insert _id field
- doc.AddIDField()
- // perform analysis directly
- analyze(doc)
- analysisResults = append(analysisResults, doc)
- }
- }
-
- seg, _, err := o.segPlugin.New(analysisResults)
- if err != nil {
- return fmt.Errorf("error building segment base: %v", err)
- }
-
- filename := zapFileName(o.segCount)
- o.segCount++
- path := o.buildPath + string(os.PathSeparator) + filename
-
- if segUnpersisted, ok := seg.(segment.UnpersistedSegment); ok {
- err = segUnpersisted.Persist(path)
- if err != nil {
- return fmt.Errorf("error persisting segment base to %s: %v", path, err)
- }
-
- o.segPaths = append(o.segPaths, path)
- return nil
- }
-
- return fmt.Errorf("new segment does not implement unpersisted: %T", seg)
- }
-
- func (o *Builder) doMerge() error {
- // as long as we have more than 1 segment, keep merging
- for len(o.segPaths) > 1 {
-
- // merge the next <mergeMax> number of segments into one new one
- // or, if there are fewer than <mergeMax> remaining, merge them all
- mergeCount := o.mergeMax
- if mergeCount > len(o.segPaths) {
- mergeCount = len(o.segPaths)
- }
-
- mergePaths := o.segPaths[0:mergeCount]
- o.segPaths = o.segPaths[mergeCount:]
-
- // open each of the segments to be merged
- mergeSegs := make([]segment.Segment, 0, mergeCount)
-
- // closeOpenedSegs attempts to close all opened
- // segments even if an error occurs, in which case
- // the first error is returned
- closeOpenedSegs := func() error {
- var err error
- for _, seg := range mergeSegs {
- clErr := seg.Close()
- if clErr != nil && err == nil {
- err = clErr
- }
- }
- return err
- }
-
- for _, mergePath := range mergePaths {
- seg, err := o.segPlugin.Open(mergePath)
- if err != nil {
- _ = closeOpenedSegs()
- return fmt.Errorf("error opening segment (%s) for merge: %v", mergePath, err)
- }
- mergeSegs = append(mergeSegs, seg)
- }
-
- // do the merge
- mergedSegPath := o.buildPath + string(os.PathSeparator) + zapFileName(o.segCount)
- drops := make([]*roaring.Bitmap, mergeCount)
- _, _, err := o.segPlugin.Merge(mergeSegs, drops, mergedSegPath, nil, nil)
- if err != nil {
- _ = closeOpenedSegs()
- return fmt.Errorf("error merging segments (%v): %v", mergePaths, err)
- }
- o.segCount++
- o.segPaths = append(o.segPaths, mergedSegPath)
-
- // close segments opened for merge
- err = closeOpenedSegs()
- if err != nil {
- return fmt.Errorf("error closing opened segments: %v", err)
- }
-
- // remove merged segments
- for _, mergePath := range mergePaths {
- err = os.RemoveAll(mergePath)
- if err != nil {
- return fmt.Errorf("error removing segment %s after merge: %v", mergePath, err)
- }
- }
- }
-
- return nil
- }
-
- func (o *Builder) Close() error {
- o.m.Lock()
- defer o.m.Unlock()
-
- // see if there is a partial batch
- err := o.maybeFlushBatchLOCKED(1)
- if err != nil {
- return fmt.Errorf("error flushing batch before close: %v", err)
- }
-
- // perform all the merging
- err = o.doMerge()
- if err != nil {
- return fmt.Errorf("error while merging: %v", err)
- }
-
- // ensure the store path exists
- err = os.MkdirAll(o.path, 0700)
- if err != nil {
- return err
- }
-
- // move final segment into place
- // segment id 2 is chosen to match the behavior of a scorch
- // index which indexes a single batch of data
- finalSegPath := o.path + string(os.PathSeparator) + zapFileName(2)
- err = os.Rename(o.segPaths[0], finalSegPath)
- if err != nil {
- return fmt.Errorf("error moving final segment into place: %v", err)
- }
-
- // remove the buildPath, as it is no longer needed
- err = os.RemoveAll(o.buildPath)
- if err != nil {
- return fmt.Errorf("error removing build path: %v", err)
- }
-
- // prepare wrapping
- seg, err := o.segPlugin.Open(finalSegPath)
- if err != nil {
- return fmt.Errorf("error opening final segment")
- }
-
- // create a segment snapshot for this segment
- ss := &SegmentSnapshot{
- segment: seg,
- }
- is := &IndexSnapshot{
- epoch: 3, // chosen to match scorch behavior when indexing a single batch
- segment: []*SegmentSnapshot{ss},
- creator: "scorch-builder",
- internal: o.internal,
- }
-
- // create the root bolt
- rootBoltPath := o.path + string(os.PathSeparator) + "root.bolt"
- rootBolt, err := bolt.Open(rootBoltPath, 0600, nil)
- if err != nil {
- return err
- }
-
- // start a write transaction
- tx, err := rootBolt.Begin(true)
- if err != nil {
- return err
- }
-
- // fill the root bolt with this fake index snapshot
- _, _, err = prepareBoltSnapshot(is, tx, o.path, o.segPlugin)
- if err != nil {
- _ = tx.Rollback()
- _ = rootBolt.Close()
- return fmt.Errorf("error preparing bolt snapshot in root.bolt: %v", err)
- }
-
- // commit bolt data
- err = tx.Commit()
- if err != nil {
- _ = rootBolt.Close()
- return fmt.Errorf("error committing bolt tx in root.bolt: %v", err)
- }
-
- // close bolt
- err = rootBolt.Close()
- if err != nil {
- return fmt.Errorf("error closing root.bolt: %v", err)
- }
-
- // close final segment
- err = seg.Close()
- if err != nil {
- return fmt.Errorf("error closing final segment: %v", err)
- }
- return nil
- }
|