diff options
author | Lauris BH <lauris@nix.lv> | 2021-01-18 03:21:14 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-01-17 20:21:14 -0500 |
commit | f5abe2f563e7dbcfd278bb05b8236af9d10a4e8e (patch) | |
tree | f448512c2db0712a9a1eee63e21909c1f6c5762f /vendor/github.com/blevesearch/zapx/v15/segment.go | |
parent | 3aa53dc6bc7e20fa7c894275e27910ea61a59bb3 (diff) | |
download | gitea-f5abe2f563e7dbcfd278bb05b8236af9d10a4e8e.tar.gz gitea-f5abe2f563e7dbcfd278bb05b8236af9d10a4e8e.zip |
Upgrade blevesearch dependency to v2.0.1 (#14346)
* Upgrade blevesearch dependency to v2.0.1
* Update rupture to v1.0.0
* Fix test
Diffstat (limited to 'vendor/github.com/blevesearch/zapx/v15/segment.go')
-rw-r--r-- | vendor/github.com/blevesearch/zapx/v15/segment.go | 570 |
1 files changed, 570 insertions, 0 deletions
diff --git a/vendor/github.com/blevesearch/zapx/v15/segment.go b/vendor/github.com/blevesearch/zapx/v15/segment.go new file mode 100644 index 0000000000..b3ad260f9d --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v15/segment.go @@ -0,0 +1,570 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bytes" + "encoding/binary" + "fmt" + "io" + "os" + "sync" + "unsafe" + + "github.com/RoaringBitmap/roaring" + mmap "github.com/blevesearch/mmap-go" + segment "github.com/blevesearch/scorch_segment_api" + "github.com/couchbase/vellum" + "github.com/golang/snappy" +) + +var reflectStaticSizeSegmentBase int + +func init() { + var sb SegmentBase + reflectStaticSizeSegmentBase = int(unsafe.Sizeof(sb)) +} + +// Open returns a zap impl of a segment +func (*ZapPlugin) Open(path string) (segment.Segment, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + mm, err := mmap.Map(f, mmap.RDONLY, 0) + if err != nil { + // mmap failed, try to close the file + _ = f.Close() + return nil, err + } + + rv := &Segment{ + SegmentBase: SegmentBase{ + mem: mm[0 : len(mm)-FooterSize], + fieldsMap: make(map[string]uint16), + fieldDvReaders: make(map[uint16]*docValueReader), + fieldFSTs: make(map[uint16]*vellum.FST), + }, + f: f, + mm: mm, + path: path, + refs: 1, + } + rv.SegmentBase.updateSize() + + err = rv.loadConfig() + if err != nil { + _ = rv.Close() + return nil, err + } + + err = rv.loadFields() + if err != nil { + _ = rv.Close() + return nil, err + } + + err = rv.loadDvReaders() + if err != nil { + _ = rv.Close() + return nil, err + } + + return rv, nil +} + +// SegmentBase is a memory only, read-only implementation of the +// segment.Segment interface, using zap's data representation. +type SegmentBase struct { + mem []byte + memCRC uint32 + chunkMode uint32 + fieldsMap map[string]uint16 // fieldName -> fieldID+1 + fieldsInv []string // fieldID -> fieldName + numDocs uint64 + storedIndexOffset uint64 + fieldsIndexOffset uint64 + docValueOffset uint64 + dictLocs []uint64 + fieldDvReaders map[uint16]*docValueReader // naive chunk cache per field + fieldDvNames []string // field names cached in fieldDvReaders + size uint64 + + m sync.Mutex + fieldFSTs map[uint16]*vellum.FST +} + +func (sb *SegmentBase) Size() int { + return int(sb.size) +} + +func (sb *SegmentBase) updateSize() { + sizeInBytes := reflectStaticSizeSegmentBase + + cap(sb.mem) + + // fieldsMap + for k := range sb.fieldsMap { + sizeInBytes += (len(k) + SizeOfString) + SizeOfUint16 + } + + // fieldsInv, dictLocs + for _, entry := range sb.fieldsInv { + sizeInBytes += len(entry) + SizeOfString + } + sizeInBytes += len(sb.dictLocs) * SizeOfUint64 + + // fieldDvReaders + for _, v := range sb.fieldDvReaders { + sizeInBytes += SizeOfUint16 + SizeOfPtr + if v != nil { + sizeInBytes += v.size() + } + } + + sb.size = uint64(sizeInBytes) +} + +func (sb *SegmentBase) AddRef() {} +func (sb *SegmentBase) DecRef() (err error) { return nil } +func (sb *SegmentBase) Close() (err error) { return nil } + +// Segment implements a persisted segment.Segment interface, by +// embedding an mmap()'ed SegmentBase. +type Segment struct { + SegmentBase + + f *os.File + mm mmap.MMap + path string + version uint32 + crc uint32 + + m sync.Mutex // Protects the fields that follow. + refs int64 +} + +func (s *Segment) Size() int { + // 8 /* size of file pointer */ + // 4 /* size of version -> uint32 */ + // 4 /* size of crc -> uint32 */ + sizeOfUints := 16 + + sizeInBytes := (len(s.path) + SizeOfString) + sizeOfUints + + // mutex, refs -> int64 + sizeInBytes += 16 + + // do not include the mmap'ed part + return sizeInBytes + s.SegmentBase.Size() - cap(s.mem) +} + +func (s *Segment) AddRef() { + s.m.Lock() + s.refs++ + s.m.Unlock() +} + +func (s *Segment) DecRef() (err error) { + s.m.Lock() + s.refs-- + if s.refs == 0 { + err = s.closeActual() + } + s.m.Unlock() + return err +} + +func (s *Segment) loadConfig() error { + crcOffset := len(s.mm) - 4 + s.crc = binary.BigEndian.Uint32(s.mm[crcOffset : crcOffset+4]) + + verOffset := crcOffset - 4 + s.version = binary.BigEndian.Uint32(s.mm[verOffset : verOffset+4]) + if s.version != Version { + return fmt.Errorf("unsupported version %d != %d", s.version, Version) + } + + chunkOffset := verOffset - 4 + s.chunkMode = binary.BigEndian.Uint32(s.mm[chunkOffset : chunkOffset+4]) + + docValueOffset := chunkOffset - 8 + s.docValueOffset = binary.BigEndian.Uint64(s.mm[docValueOffset : docValueOffset+8]) + + fieldsIndexOffset := docValueOffset - 8 + s.fieldsIndexOffset = binary.BigEndian.Uint64(s.mm[fieldsIndexOffset : fieldsIndexOffset+8]) + + storedIndexOffset := fieldsIndexOffset - 8 + s.storedIndexOffset = binary.BigEndian.Uint64(s.mm[storedIndexOffset : storedIndexOffset+8]) + + numDocsOffset := storedIndexOffset - 8 + s.numDocs = binary.BigEndian.Uint64(s.mm[numDocsOffset : numDocsOffset+8]) + return nil +} + +func (s *SegmentBase) loadFields() error { + // NOTE for now we assume the fields index immediately precedes + // the footer, and if this changes, need to adjust accordingly (or + // store explicit length), where s.mem was sliced from s.mm in Open(). + fieldsIndexEnd := uint64(len(s.mem)) + + // iterate through fields index + var fieldID uint64 + for s.fieldsIndexOffset+(8*fieldID) < fieldsIndexEnd { + addr := binary.BigEndian.Uint64(s.mem[s.fieldsIndexOffset+(8*fieldID) : s.fieldsIndexOffset+(8*fieldID)+8]) + + dictLoc, read := binary.Uvarint(s.mem[addr:fieldsIndexEnd]) + n := uint64(read) + s.dictLocs = append(s.dictLocs, dictLoc) + + var nameLen uint64 + nameLen, read = binary.Uvarint(s.mem[addr+n : fieldsIndexEnd]) + n += uint64(read) + + name := string(s.mem[addr+n : addr+n+nameLen]) + s.fieldsInv = append(s.fieldsInv, name) + s.fieldsMap[name] = uint16(fieldID + 1) + + fieldID++ + } + return nil +} + +// Dictionary returns the term dictionary for the specified field +func (s *SegmentBase) Dictionary(field string) (segment.TermDictionary, error) { + dict, err := s.dictionary(field) + if err == nil && dict == nil { + return emptyDictionary, nil + } + return dict, err +} + +func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) { + fieldIDPlus1 := sb.fieldsMap[field] + if fieldIDPlus1 > 0 { + rv = &Dictionary{ + sb: sb, + field: field, + fieldID: fieldIDPlus1 - 1, + } + + dictStart := sb.dictLocs[rv.fieldID] + if dictStart > 0 { + var ok bool + sb.m.Lock() + if rv.fst, ok = sb.fieldFSTs[rv.fieldID]; !ok { + // read the length of the vellum data + vellumLen, read := binary.Uvarint(sb.mem[dictStart : dictStart+binary.MaxVarintLen64]) + fstBytes := sb.mem[dictStart+uint64(read) : dictStart+uint64(read)+vellumLen] + rv.fst, err = vellum.Load(fstBytes) + if err != nil { + sb.m.Unlock() + return nil, fmt.Errorf("dictionary field %s vellum err: %v", field, err) + } + + sb.fieldFSTs[rv.fieldID] = rv.fst + } + + sb.m.Unlock() + rv.fstReader, err = rv.fst.Reader() + if err != nil { + return nil, fmt.Errorf("dictionary field %s vellum reader err: %v", field, err) + } + } + } + + return rv, nil +} + +// visitDocumentCtx holds data structures that are reusable across +// multiple VisitDocument() calls to avoid memory allocations +type visitDocumentCtx struct { + buf []byte + reader bytes.Reader + arrayPos []uint64 +} + +var visitDocumentCtxPool = sync.Pool{ + New: func() interface{} { + reuse := &visitDocumentCtx{} + return reuse + }, +} + +// VisitStoredFields invokes the StoredFieldValueVisitor for each stored field +// for the specified doc number +func (s *SegmentBase) VisitStoredFields(num uint64, visitor segment.StoredFieldValueVisitor) error { + vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) + defer visitDocumentCtxPool.Put(vdc) + return s.visitStoredFields(vdc, num, visitor) +} + +func (s *SegmentBase) visitStoredFields(vdc *visitDocumentCtx, num uint64, + visitor segment.StoredFieldValueVisitor) error { + // first make sure this is a valid number in this segment + if num < s.numDocs { + meta, compressed := s.getDocStoredMetaAndCompressed(num) + + vdc.reader.Reset(meta) + + // handle _id field special case + idFieldValLen, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return err + } + idFieldVal := compressed[:idFieldValLen] + + keepGoing := visitor("_id", byte('t'), idFieldVal, nil) + if !keepGoing { + visitDocumentCtxPool.Put(vdc) + return nil + } + + // handle non-"_id" fields + compressed = compressed[idFieldValLen:] + + uncompressed, err := snappy.Decode(vdc.buf[:cap(vdc.buf)], compressed) + if err != nil { + return err + } + + for keepGoing { + field, err := binary.ReadUvarint(&vdc.reader) + if err == io.EOF { + break + } + if err != nil { + return err + } + typ, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return err + } + offset, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return err + } + l, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return err + } + numap, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return err + } + var arrayPos []uint64 + if numap > 0 { + if cap(vdc.arrayPos) < int(numap) { + vdc.arrayPos = make([]uint64, numap) + } + arrayPos = vdc.arrayPos[:numap] + for i := 0; i < int(numap); i++ { + ap, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return err + } + arrayPos[i] = ap + } + } + + value := uncompressed[offset : offset+l] + keepGoing = visitor(s.fieldsInv[field], byte(typ), value, arrayPos) + } + + vdc.buf = uncompressed + } + return nil +} + +// DocID returns the value of the _id field for the given docNum +func (s *SegmentBase) DocID(num uint64) ([]byte, error) { + if num >= s.numDocs { + return nil, nil + } + + vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) + + meta, compressed := s.getDocStoredMetaAndCompressed(num) + + vdc.reader.Reset(meta) + + // handle _id field special case + idFieldValLen, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return nil, err + } + idFieldVal := compressed[:idFieldValLen] + + visitDocumentCtxPool.Put(vdc) + + return idFieldVal, nil +} + +// Count returns the number of documents in this segment. +func (s *SegmentBase) Count() uint64 { + return s.numDocs +} + +// DocNumbers returns a bitset corresponding to the doc numbers of all the +// provided _id strings +func (s *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) { + rv := roaring.New() + + if len(s.fieldsMap) > 0 { + idDict, err := s.dictionary("_id") + if err != nil { + return nil, err + } + + postingsList := emptyPostingsList + + sMax, err := idDict.fst.GetMaxKey() + if err != nil { + return nil, err + } + sMaxStr := string(sMax) + filteredIds := make([]string, 0, len(ids)) + for _, id := range ids { + if id <= sMaxStr { + filteredIds = append(filteredIds, id) + } + } + + for _, id := range filteredIds { + postingsList, err = idDict.postingsList([]byte(id), nil, postingsList) + if err != nil { + return nil, err + } + postingsList.OrInto(rv) + } + } + + return rv, nil +} + +// Fields returns the field names used in this segment +func (s *SegmentBase) Fields() []string { + return s.fieldsInv +} + +// Path returns the path of this segment on disk +func (s *Segment) Path() string { + return s.path +} + +// Close releases all resources associated with this segment +func (s *Segment) Close() (err error) { + return s.DecRef() +} + +func (s *Segment) closeActual() (err error) { + if s.mm != nil { + err = s.mm.Unmap() + } + // try to close file even if unmap failed + if s.f != nil { + err2 := s.f.Close() + if err == nil { + // try to return first error + err = err2 + } + } + return +} + +// some helpers i started adding for the command-line utility + +// Data returns the underlying mmaped data slice +func (s *Segment) Data() []byte { + return s.mm +} + +// CRC returns the CRC value stored in the file footer +func (s *Segment) CRC() uint32 { + return s.crc +} + +// Version returns the file version in the file footer +func (s *Segment) Version() uint32 { + return s.version +} + +// ChunkFactor returns the chunk factor in the file footer +func (s *Segment) ChunkMode() uint32 { + return s.chunkMode +} + +// FieldsIndexOffset returns the fields index offset in the file footer +func (s *Segment) FieldsIndexOffset() uint64 { + return s.fieldsIndexOffset +} + +// StoredIndexOffset returns the stored value index offset in the file footer +func (s *Segment) StoredIndexOffset() uint64 { + return s.storedIndexOffset +} + +// DocValueOffset returns the docValue offset in the file footer +func (s *Segment) DocValueOffset() uint64 { + return s.docValueOffset +} + +// NumDocs returns the number of documents in the file footer +func (s *Segment) NumDocs() uint64 { + return s.numDocs +} + +// DictAddr is a helper function to compute the file offset where the +// dictionary is stored for the specified field. +func (s *Segment) DictAddr(field string) (uint64, error) { + fieldIDPlus1, ok := s.fieldsMap[field] + if !ok { + return 0, fmt.Errorf("no such field '%s'", field) + } + + return s.dictLocs[fieldIDPlus1-1], nil +} + +func (s *SegmentBase) loadDvReaders() error { + if s.docValueOffset == fieldNotUninverted || s.numDocs == 0 { + return nil + } + + var read uint64 + for fieldID, field := range s.fieldsInv { + var fieldLocStart, fieldLocEnd uint64 + var n int + fieldLocStart, n = binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64]) + if n <= 0 { + return fmt.Errorf("loadDvReaders: failed to read the docvalue offset start for field %d", fieldID) + } + read += uint64(n) + fieldLocEnd, n = binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64]) + if n <= 0 { + return fmt.Errorf("loadDvReaders: failed to read the docvalue offset end for field %d", fieldID) + } + read += uint64(n) + + fieldDvReader, err := s.loadFieldDocValueReader(field, fieldLocStart, fieldLocEnd) + if err != nil { + return err + } + if fieldDvReader != nil { + s.fieldDvReaders[uint16(fieldID)] = fieldDvReader + s.fieldDvNames = append(s.fieldDvNames, field) + } + } + + return nil +} |