* update bleve to master b17287a86f6cac923a5d886e10618df994eeb54b6724eac2e3b8dde89cfbe3a2 * remove unused pkg from dep file * change bleve from master to recent revisiontags/v1.9.0-dev
@@ -40,14 +40,6 @@ | |||
revision = "1a28a7fa985680f9f4e1644c0a857ec359a444b0" | |||
version = "v0.4.7" | |||
[[projects]] | |||
branch = "master" | |||
digest = "1:93367b6d47a8ccc7d14f9f493ccf103ccf5afb698559ff8e8f1999427ce27ace" | |||
name = "github.com/Smerity/govarint" | |||
packages = ["."] | |||
pruneopts = "NUT" | |||
revision = "7265e41f48f15fd61751e16da866af3c704bb3ab" | |||
[[projects]] | |||
branch = "master" | |||
digest = "1:d290f4b25abbf574f80f60c8a5603ddada784f13f436b91a9a927bc7ce5a0146" | |||
@@ -98,7 +90,8 @@ | |||
revision = "3a771d992973f24aa725d07868b467d1ddfceafb" | |||
[[projects]] | |||
digest = "1:c10f35be6200b09e26da267ca80f837315093ecaba27e7a223071380efb9dd32" | |||
branch = "master" | |||
digest = "1:b17287a86f6cac923a5d886e10618df994eeb54b6724eac2e3b8dde89cfbe3a2" | |||
name = "github.com/blevesearch/bleve" | |||
packages = [ | |||
".", | |||
@@ -121,7 +114,6 @@ | |||
"index/scorch", | |||
"index/scorch/mergeplan", | |||
"index/scorch/segment", | |||
"index/scorch/segment/mem", | |||
"index/scorch/segment/zap", | |||
"index/store", | |||
"index/store/boltdb", | |||
@@ -141,9 +133,10 @@ | |||
"search/query", | |||
"search/scorer", | |||
"search/searcher", | |||
"size", | |||
] | |||
pruneopts = "NUT" | |||
revision = "c74e08f039e56cef576e4336382b2a2d12d9e026" | |||
revision = "05d86ea8f6e30456949f612cf68cf4a27ce8c9c5" | |||
[[projects]] | |||
branch = "master" | |||
@@ -160,14 +153,6 @@ | |||
pruneopts = "NUT" | |||
revision = "db70c57796cc8c310613541dfade3dce627d09c7" | |||
[[projects]] | |||
digest = "1:c7e0968c05659f3973148cd5c5387d6ee960a6ae1b2eaaec0b1d435d806458bb" | |||
name = "github.com/boltdb/bolt" | |||
packages = ["."] | |||
pruneopts = "NUT" | |||
revision = "ccd680d8c1a0179ac3d68f692b01e1a1589cbfc7" | |||
source = "github.com/go-gitea/bolt" | |||
[[projects]] | |||
digest = "1:7c96cf7bf7f52af67f7a8222185813b9b665f5172ec2ac5f7d49ed96e5fcf3e5" | |||
name = "github.com/boombuler/barcode" | |||
@@ -217,15 +202,16 @@ | |||
[[projects]] | |||
branch = "master" | |||
digest = "1:82e1ad11d777f7bff9a1fc678a8a534a318f85e5026a8a4d6f4a94a6b0678bb6" | |||
digest = "1:6a658ac7d23204dc743c7155557c45273747d78e05ae0579742bd6b744bce215" | |||
name = "github.com/couchbase/vellum" | |||
packages = [ | |||
".", | |||
"levenshtein2", | |||
"regexp", | |||
"utf8", | |||
] | |||
pruneopts = "NUT" | |||
revision = "eb6ae3743b3f300f2136f83ca78c08cc071edbd4" | |||
revision = "e91b68ff3efe3cc11723aa25dd315cbc9276cd65" | |||
[[projects]] | |||
branch = "master" | |||
@@ -287,6 +273,14 @@ | |||
revision = "1615341f118ae12f353cc8a983f35b584342c9b3" | |||
version = "v1.12.0" | |||
[[projects]] | |||
digest = "1:ae8eea1a24ae43a46c2e96631b6303fcc4210ca0ac9d643e4da965029d1b511d" | |||
name = "github.com/etcd-io/bbolt" | |||
packages = ["."] | |||
pruneopts = "NUT" | |||
revision = "63597a96ec0ad9e6d43c3fc81e809909e0237461" | |||
version = "v1.3.2" | |||
[[projects]] | |||
digest = "1:8603f74d35c93b37c615a02ba297be2cf2efc9ff6f1ff2b458a903990b568e48" | |||
name = "github.com/ethantkoenig/rupture" |
@@ -15,10 +15,8 @@ ignored = ["google.golang.org/appengine*"] | |||
name = "code.gitea.io/sdk" | |||
[[constraint]] | |||
# branch = "master" | |||
revision = "c74e08f039e56cef576e4336382b2a2d12d9e026" | |||
revision = "05d86ea8f6e30456949f612cf68cf4a27ce8c9c5" | |||
name = "github.com/blevesearch/bleve" | |||
#Not targetting v0.7.0 since standard where use only just after this tag | |||
[[constraint]] | |||
revision = "12dd70caea0268ac0d6c2707d0611ef601e7c64e" | |||
@@ -108,11 +106,6 @@ ignored = ["google.golang.org/appengine*"] | |||
name = "gopkg.in/testfixtures.v2" | |||
version = "2.0.0" | |||
[[override]] | |||
name = "github.com/boltdb/bolt" | |||
revision = "ccd680d8c1a0179ac3d68f692b01e1a1589cbfc7" | |||
source = "github.com/go-gitea/bolt" | |||
[[override]] | |||
branch = "master" | |||
name = "golang.org/x/oauth2" |
@@ -1,22 +0,0 @@ | |||
The MIT License (MIT) | |||
Copyright (c) 2015 Stephen Merity | |||
Permission is hereby granted, free of charge, to any person obtaining a copy | |||
of this software and associated documentation files (the "Software"), to deal | |||
in the Software without restriction, including without limitation the rights | |||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
copies of the Software, and to permit persons to whom the Software is | |||
furnished to do so, subject to the following conditions: | |||
The above copyright notice and this permission notice shall be included in all | |||
copies or substantial portions of the Software. | |||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
SOFTWARE. | |||
@@ -1,229 +0,0 @@ | |||
package govarint | |||
import "encoding/binary" | |||
import "io" | |||
type U32VarintEncoder interface { | |||
PutU32(x uint32) int | |||
Close() | |||
} | |||
type U32VarintDecoder interface { | |||
GetU32() (uint32, error) | |||
} | |||
/// | |||
type U64VarintEncoder interface { | |||
PutU64(x uint64) int | |||
Close() | |||
} | |||
type U64VarintDecoder interface { | |||
GetU64() (uint64, error) | |||
} | |||
/// | |||
type U32GroupVarintEncoder struct { | |||
w io.Writer | |||
index int | |||
store [4]uint32 | |||
temp [17]byte | |||
} | |||
func NewU32GroupVarintEncoder(w io.Writer) *U32GroupVarintEncoder { return &U32GroupVarintEncoder{w: w} } | |||
func (b *U32GroupVarintEncoder) Flush() (int, error) { | |||
// TODO: Is it more efficient to have a tailored version that's called only in Close()? | |||
// If index is zero, there are no integers to flush | |||
if b.index == 0 { | |||
return 0, nil | |||
} | |||
// In the case we're flushing (the group isn't of size four), the non-values should be zero | |||
// This ensures the unused entries are all zero in the sizeByte | |||
for i := b.index; i < 4; i++ { | |||
b.store[i] = 0 | |||
} | |||
length := 1 | |||
// We need to reset the size byte to zero as we only bitwise OR into it, we don't overwrite it | |||
b.temp[0] = 0 | |||
for i, x := range b.store { | |||
size := byte(0) | |||
shifts := []byte{24, 16, 8, 0} | |||
for _, shift := range shifts { | |||
// Always writes at least one byte -- the first one (shift = 0) | |||
// Will write more bytes until the rest of the integer is all zeroes | |||
if (x>>shift) != 0 || shift == 0 { | |||
size += 1 | |||
b.temp[length] = byte(x >> shift) | |||
length += 1 | |||
} | |||
} | |||
// We store the size in two of the eight bits in the first byte (sizeByte) | |||
// 0 means there is one byte in total, hence why we subtract one from size | |||
b.temp[0] |= (size - 1) << (uint8(3-i) * 2) | |||
} | |||
// If we're flushing without a full group of four, remove the unused bytes we computed | |||
// This enables us to realize it's a partial group on decoding thanks to EOF | |||
if b.index != 4 { | |||
length -= 4 - b.index | |||
} | |||
_, err := b.w.Write(b.temp[:length]) | |||
return length, err | |||
} | |||
func (b *U32GroupVarintEncoder) PutU32(x uint32) (int, error) { | |||
bytesWritten := 0 | |||
b.store[b.index] = x | |||
b.index += 1 | |||
if b.index == 4 { | |||
n, err := b.Flush() | |||
if err != nil { | |||
return n, err | |||
} | |||
bytesWritten += n | |||
b.index = 0 | |||
} | |||
return bytesWritten, nil | |||
} | |||
func (b *U32GroupVarintEncoder) Close() { | |||
// On Close, we flush any remaining values that might not have been in a full group | |||
b.Flush() | |||
} | |||
/// | |||
type U32GroupVarintDecoder struct { | |||
r io.ByteReader | |||
group [4]uint32 | |||
pos int | |||
finished bool | |||
capacity int | |||
} | |||
func NewU32GroupVarintDecoder(r io.ByteReader) *U32GroupVarintDecoder { | |||
return &U32GroupVarintDecoder{r: r, pos: 4, capacity: 4} | |||
} | |||
func (b *U32GroupVarintDecoder) getGroup() error { | |||
// We should always receive a sizeByte if there are more values to read | |||
sizeByte, err := b.r.ReadByte() | |||
if err != nil { | |||
return err | |||
} | |||
// Calculate the size of the four incoming 32 bit integers | |||
// 0b00 means 1 byte to read, 0b01 = 2, etc | |||
b.group[0] = uint32((sizeByte >> 6) & 3) | |||
b.group[1] = uint32((sizeByte >> 4) & 3) | |||
b.group[2] = uint32((sizeByte >> 2) & 3) | |||
b.group[3] = uint32(sizeByte & 3) | |||
// | |||
for index, size := range b.group { | |||
b.group[index] = 0 | |||
// Any error that occurs in earlier byte reads should be repeated at the end one | |||
// Hence we only catch and report the final ReadByte's error | |||
var err error | |||
switch size { | |||
case 0: | |||
var x byte | |||
x, err = b.r.ReadByte() | |||
b.group[index] = uint32(x) | |||
case 1: | |||
var x, y byte | |||
x, _ = b.r.ReadByte() | |||
y, err = b.r.ReadByte() | |||
b.group[index] = uint32(x)<<8 | uint32(y) | |||
case 2: | |||
var x, y, z byte | |||
x, _ = b.r.ReadByte() | |||
y, _ = b.r.ReadByte() | |||
z, err = b.r.ReadByte() | |||
b.group[index] = uint32(x)<<16 | uint32(y)<<8 | uint32(z) | |||
case 3: | |||
var x, y, z, zz byte | |||
x, _ = b.r.ReadByte() | |||
y, _ = b.r.ReadByte() | |||
z, _ = b.r.ReadByte() | |||
zz, err = b.r.ReadByte() | |||
b.group[index] = uint32(x)<<24 | uint32(y)<<16 | uint32(z)<<8 | uint32(zz) | |||
} | |||
if err != nil { | |||
if err == io.EOF { | |||
// If we hit EOF here, we have found a partial group | |||
// We've return any valid entries we have read and return EOF once we run out | |||
b.capacity = index | |||
b.finished = true | |||
break | |||
} else { | |||
return err | |||
} | |||
} | |||
} | |||
// Reset the pos pointer to the beginning of the read values | |||
b.pos = 0 | |||
return nil | |||
} | |||
func (b *U32GroupVarintDecoder) GetU32() (uint32, error) { | |||
// Check if we have any more values to give out - if not, let's get them | |||
if b.pos == b.capacity { | |||
// If finished is set, there is nothing else to do | |||
if b.finished { | |||
return 0, io.EOF | |||
} | |||
err := b.getGroup() | |||
if err != nil { | |||
return 0, err | |||
} | |||
} | |||
// Increment pointer and return the value stored at that point | |||
b.pos += 1 | |||
return b.group[b.pos-1], nil | |||
} | |||
/// | |||
type Base128Encoder struct { | |||
w io.Writer | |||
tmpBytes []byte | |||
} | |||
func NewU32Base128Encoder(w io.Writer) *Base128Encoder { | |||
return &Base128Encoder{w: w, tmpBytes: make([]byte, binary.MaxVarintLen32)} | |||
} | |||
func NewU64Base128Encoder(w io.Writer) *Base128Encoder { | |||
return &Base128Encoder{w: w, tmpBytes: make([]byte, binary.MaxVarintLen64)} | |||
} | |||
func (b *Base128Encoder) PutU32(x uint32) (int, error) { | |||
writtenBytes := binary.PutUvarint(b.tmpBytes, uint64(x)) | |||
return b.w.Write(b.tmpBytes[:writtenBytes]) | |||
} | |||
func (b *Base128Encoder) PutU64(x uint64) (int, error) { | |||
writtenBytes := binary.PutUvarint(b.tmpBytes, x) | |||
return b.w.Write(b.tmpBytes[:writtenBytes]) | |||
} | |||
func (b *Base128Encoder) Close() { | |||
} | |||
/// | |||
type Base128Decoder struct { | |||
r io.ByteReader | |||
} | |||
func NewU32Base128Decoder(r io.ByteReader) *Base128Decoder { return &Base128Decoder{r: r} } | |||
func NewU64Base128Decoder(r io.ByteReader) *Base128Decoder { return &Base128Decoder{r: r} } | |||
func (b *Base128Decoder) GetU32() (uint32, error) { | |||
v, err := binary.ReadUvarint(b.r) | |||
return uint32(v), err | |||
} | |||
func (b *Base128Decoder) GetU64() (uint64, error) { | |||
return binary.ReadUvarint(b.r) | |||
} |
@@ -14,6 +14,22 @@ | |||
package analysis | |||
import ( | |||
"reflect" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var reflectStaticSizeTokenLocation int | |||
var reflectStaticSizeTokenFreq int | |||
func init() { | |||
var tl TokenLocation | |||
reflectStaticSizeTokenLocation = int(reflect.TypeOf(tl).Size()) | |||
var tf TokenFreq | |||
reflectStaticSizeTokenFreq = int(reflect.TypeOf(tf).Size()) | |||
} | |||
// TokenLocation represents one occurrence of a term at a particular location in | |||
// a field. Start, End and Position have the same meaning as in analysis.Token. | |||
// Field and ArrayPositions identify the field value in the source document. | |||
@@ -26,6 +42,12 @@ type TokenLocation struct { | |||
Position int | |||
} | |||
func (tl *TokenLocation) Size() int { | |||
rv := reflectStaticSizeTokenLocation | |||
rv += len(tl.ArrayPositions) * size.SizeOfUint64 | |||
return rv | |||
} | |||
// TokenFreq represents all the occurrences of a term in all fields of a | |||
// document. | |||
type TokenFreq struct { | |||
@@ -34,6 +56,15 @@ type TokenFreq struct { | |||
frequency int | |||
} | |||
func (tf *TokenFreq) Size() int { | |||
rv := reflectStaticSizeTokenFreq | |||
rv += len(tf.Term) | |||
for _, loc := range tf.Locations { | |||
rv += loc.Size() | |||
} | |||
return rv | |||
} | |||
func (tf *TokenFreq) Frequency() int { | |||
return tf.frequency | |||
} | |||
@@ -42,6 +73,16 @@ func (tf *TokenFreq) Frequency() int { | |||
// fields. | |||
type TokenFrequencies map[string]*TokenFreq | |||
func (tfs TokenFrequencies) Size() int { | |||
rv := size.SizeOfMap | |||
rv += len(tfs) * (size.SizeOfString + size.SizeOfPtr) | |||
for k, v := range tfs { | |||
rv += len(k) | |||
rv += v.Size() | |||
} | |||
return rv | |||
} | |||
func (tfs TokenFrequencies) MergeAll(remoteField string, other TokenFrequencies) { | |||
// walk the new token frequencies | |||
for tfk, tf := range other { |
@@ -46,11 +46,11 @@ type Parser struct { | |||
index int | |||
} | |||
func NewParser(len, position, index int) *Parser { | |||
func NewParser(length, position, index int) *Parser { | |||
return &Parser{ | |||
bufferLen: len, | |||
buffer: make([]rune, 0, len), | |||
tokens: make([]*analysis.Token, 0, len), | |||
bufferLen: length, | |||
buffer: make([]rune, 0, length), | |||
tokens: make([]*analysis.Token, 0, length), | |||
position: position, | |||
index: index, | |||
} |
@@ -21,7 +21,7 @@ import ( | |||
const Name = "unique" | |||
// UniqueTermFilter retains only the tokens which mark the first occurence of | |||
// UniqueTermFilter retains only the tokens which mark the first occurrence of | |||
// a term. Tokens whose term appears in a preceding token are dropped. | |||
type UniqueTermFilter struct{} | |||
@@ -14,7 +14,19 @@ | |||
package document | |||
import "fmt" | |||
import ( | |||
"fmt" | |||
"reflect" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var reflectStaticSizeDocument int | |||
func init() { | |||
var d Document | |||
reflectStaticSizeDocument = int(reflect.TypeOf(d).Size()) | |||
} | |||
type Document struct { | |||
ID string `json:"id"` | |||
@@ -30,6 +42,21 @@ func NewDocument(id string) *Document { | |||
} | |||
} | |||
func (d *Document) Size() int { | |||
sizeInBytes := reflectStaticSizeDocument + size.SizeOfPtr + | |||
len(d.ID) | |||
for _, entry := range d.Fields { | |||
sizeInBytes += entry.Size() | |||
} | |||
for _, entry := range d.CompositeFields { | |||
sizeInBytes += entry.Size() | |||
} | |||
return sizeInBytes | |||
} | |||
func (d *Document) AddField(f Field) *Document { | |||
switch f := f.(type) { | |||
case *CompositeField: |
@@ -36,4 +36,6 @@ type Field interface { | |||
// that this field represents - this is a common metric for tracking | |||
// the rate of indexing | |||
NumPlainTextBytes() uint64 | |||
Size() int | |||
} |
@@ -16,10 +16,19 @@ package document | |||
import ( | |||
"fmt" | |||
"reflect" | |||
"github.com/blevesearch/bleve/analysis" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var reflectStaticSizeBooleanField int | |||
func init() { | |||
var f BooleanField | |||
reflectStaticSizeBooleanField = int(reflect.TypeOf(f).Size()) | |||
} | |||
const DefaultBooleanIndexingOptions = StoreField | IndexField | DocValues | |||
type BooleanField struct { | |||
@@ -30,6 +39,13 @@ type BooleanField struct { | |||
numPlainTextBytes uint64 | |||
} | |||
func (b *BooleanField) Size() int { | |||
return reflectStaticSizeBooleanField + size.SizeOfPtr + | |||
len(b.name) + | |||
len(b.arrayPositions)*size.SizeOfUint64 + | |||
len(b.value) | |||
} | |||
func (b *BooleanField) Name() string { | |||
return b.name | |||
} |
@@ -15,9 +15,19 @@ | |||
package document | |||
import ( | |||
"reflect" | |||
"github.com/blevesearch/bleve/analysis" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var reflectStaticSizeCompositeField int | |||
func init() { | |||
var cf CompositeField | |||
reflectStaticSizeCompositeField = int(reflect.TypeOf(cf).Size()) | |||
} | |||
const DefaultCompositeIndexingOptions = IndexField | |||
type CompositeField struct { | |||
@@ -54,6 +64,21 @@ func NewCompositeFieldWithIndexingOptions(name string, defaultInclude bool, incl | |||
return rv | |||
} | |||
func (c *CompositeField) Size() int { | |||
sizeInBytes := reflectStaticSizeCompositeField + size.SizeOfPtr + | |||
len(c.name) | |||
for k, _ := range c.includedFields { | |||
sizeInBytes += size.SizeOfString + len(k) + size.SizeOfBool | |||
} | |||
for k, _ := range c.excludedFields { | |||
sizeInBytes += size.SizeOfString + len(k) + size.SizeOfBool | |||
} | |||
return sizeInBytes | |||
} | |||
func (c *CompositeField) Name() string { | |||
return c.name | |||
} |
@@ -17,12 +17,21 @@ package document | |||
import ( | |||
"fmt" | |||
"math" | |||
"reflect" | |||
"time" | |||
"github.com/blevesearch/bleve/analysis" | |||
"github.com/blevesearch/bleve/numeric" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var reflectStaticSizeDateTimeField int | |||
func init() { | |||
var f DateTimeField | |||
reflectStaticSizeDateTimeField = int(reflect.TypeOf(f).Size()) | |||
} | |||
const DefaultDateTimeIndexingOptions = StoreField | IndexField | DocValues | |||
const DefaultDateTimePrecisionStep uint = 4 | |||
@@ -37,6 +46,12 @@ type DateTimeField struct { | |||
numPlainTextBytes uint64 | |||
} | |||
func (n *DateTimeField) Size() int { | |||
return reflectStaticSizeDateTimeField + size.SizeOfPtr + | |||
len(n.name) + | |||
len(n.arrayPositions)*size.SizeOfUint64 | |||
} | |||
func (n *DateTimeField) Name() string { | |||
return n.name | |||
} |
@@ -16,12 +16,21 @@ package document | |||
import ( | |||
"fmt" | |||
"reflect" | |||
"github.com/blevesearch/bleve/analysis" | |||
"github.com/blevesearch/bleve/geo" | |||
"github.com/blevesearch/bleve/numeric" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var reflectStaticSizeGeoPointField int | |||
func init() { | |||
var f GeoPointField | |||
reflectStaticSizeGeoPointField = int(reflect.TypeOf(f).Size()) | |||
} | |||
var GeoPrecisionStep uint = 9 | |||
type GeoPointField struct { | |||
@@ -32,6 +41,12 @@ type GeoPointField struct { | |||
numPlainTextBytes uint64 | |||
} | |||
func (n *GeoPointField) Size() int { | |||
return reflectStaticSizeGeoPointField + size.SizeOfPtr + | |||
len(n.name) + | |||
len(n.arrayPositions)*size.SizeOfUint64 | |||
} | |||
func (n *GeoPointField) Name() string { | |||
return n.name | |||
} |
@@ -16,11 +16,20 @@ package document | |||
import ( | |||
"fmt" | |||
"reflect" | |||
"github.com/blevesearch/bleve/analysis" | |||
"github.com/blevesearch/bleve/numeric" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var reflectStaticSizeNumericField int | |||
func init() { | |||
var f NumericField | |||
reflectStaticSizeNumericField = int(reflect.TypeOf(f).Size()) | |||
} | |||
const DefaultNumericIndexingOptions = StoreField | IndexField | DocValues | |||
const DefaultPrecisionStep uint = 4 | |||
@@ -33,6 +42,12 @@ type NumericField struct { | |||
numPlainTextBytes uint64 | |||
} | |||
func (n *NumericField) Size() int { | |||
return reflectStaticSizeNumericField + size.SizeOfPtr + | |||
len(n.name) + | |||
len(n.arrayPositions)*size.SizeOfPtr | |||
} | |||
func (n *NumericField) Name() string { | |||
return n.name | |||
} |
@@ -16,10 +16,19 @@ package document | |||
import ( | |||
"fmt" | |||
"reflect" | |||
"github.com/blevesearch/bleve/analysis" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var reflectStaticSizeTextField int | |||
func init() { | |||
var f TextField | |||
reflectStaticSizeTextField = int(reflect.TypeOf(f).Size()) | |||
} | |||
const DefaultTextIndexingOptions = IndexField | DocValues | |||
type TextField struct { | |||
@@ -31,6 +40,13 @@ type TextField struct { | |||
numPlainTextBytes uint64 | |||
} | |||
func (t *TextField) Size() int { | |||
return reflectStaticSizeTextField + size.SizeOfPtr + | |||
len(t.name) + | |||
len(t.arrayPositions)*size.SizeOfUint64 + | |||
len(t.value) | |||
} | |||
func (t *TextField) Name() string { | |||
return t.name | |||
} |
@@ -0,0 +1,174 @@ | |||
// The code here was obtained from: | |||
// https://github.com/mmcloughlin/geohash | |||
// The MIT License (MIT) | |||
// Copyright (c) 2015 Michael McLoughlin | |||
// Permission is hereby granted, free of charge, to any person obtaining a copy | |||
// of this software and associated documentation files (the "Software"), to deal | |||
// in the Software without restriction, including without limitation the rights | |||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
// copies of the Software, and to permit persons to whom the Software is | |||
// furnished to do so, subject to the following conditions: | |||
// The above copyright notice and this permission notice shall be included in all | |||
// copies or substantial portions of the Software. | |||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
// SOFTWARE. | |||
package geo | |||
import ( | |||
"math" | |||
) | |||
// encoding encapsulates an encoding defined by a given base32 alphabet. | |||
type encoding struct { | |||
enc string | |||
dec [256]byte | |||
} | |||
// newEncoding constructs a new encoding defined by the given alphabet, | |||
// which must be a 32-byte string. | |||
func newEncoding(encoder string) *encoding { | |||
e := new(encoding) | |||
e.enc = encoder | |||
for i := 0; i < len(e.dec); i++ { | |||
e.dec[i] = 0xff | |||
} | |||
for i := 0; i < len(encoder); i++ { | |||
e.dec[encoder[i]] = byte(i) | |||
} | |||
return e | |||
} | |||
// Decode string into bits of a 64-bit word. The string s may be at most 12 | |||
// characters. | |||
func (e *encoding) decode(s string) uint64 { | |||
x := uint64(0) | |||
for i := 0; i < len(s); i++ { | |||
x = (x << 5) | uint64(e.dec[s[i]]) | |||
} | |||
return x | |||
} | |||
// Encode bits of 64-bit word into a string. | |||
func (e *encoding) encode(x uint64) string { | |||
b := [12]byte{} | |||
for i := 0; i < 12; i++ { | |||
b[11-i] = e.enc[x&0x1f] | |||
x >>= 5 | |||
} | |||
return string(b[:]) | |||
} | |||
// Base32Encoding with the Geohash alphabet. | |||
var base32encoding = newEncoding("0123456789bcdefghjkmnpqrstuvwxyz") | |||
// BoundingBox returns the region encoded by the given string geohash. | |||
func geoBoundingBox(hash string) geoBox { | |||
bits := uint(5 * len(hash)) | |||
inthash := base32encoding.decode(hash) | |||
return geoBoundingBoxIntWithPrecision(inthash, bits) | |||
} | |||
// Box represents a rectangle in latitude/longitude space. | |||
type geoBox struct { | |||
minLat float64 | |||
maxLat float64 | |||
minLng float64 | |||
maxLng float64 | |||
} | |||
// Round returns a point inside the box, making an effort to round to minimal | |||
// precision. | |||
func (b geoBox) round() (lat, lng float64) { | |||
x := maxDecimalPower(b.maxLat - b.minLat) | |||
lat = math.Ceil(b.minLat/x) * x | |||
x = maxDecimalPower(b.maxLng - b.minLng) | |||
lng = math.Ceil(b.minLng/x) * x | |||
return | |||
} | |||
// precalculated for performance | |||
var exp232 = math.Exp2(32) | |||
// errorWithPrecision returns the error range in latitude and longitude for in | |||
// integer geohash with bits of precision. | |||
func errorWithPrecision(bits uint) (latErr, lngErr float64) { | |||
b := int(bits) | |||
latBits := b / 2 | |||
lngBits := b - latBits | |||
latErr = math.Ldexp(180.0, -latBits) | |||
lngErr = math.Ldexp(360.0, -lngBits) | |||
return | |||
} | |||
// minDecimalPlaces returns the minimum number of decimal places such that | |||
// there must exist an number with that many places within any range of width | |||
// r. This is intended for returning minimal precision coordinates inside a | |||
// box. | |||
func maxDecimalPower(r float64) float64 { | |||
m := int(math.Floor(math.Log10(r))) | |||
return math.Pow10(m) | |||
} | |||
// Encode the position of x within the range -r to +r as a 32-bit integer. | |||
func encodeRange(x, r float64) uint32 { | |||
p := (x + r) / (2 * r) | |||
return uint32(p * exp232) | |||
} | |||
// Decode the 32-bit range encoding X back to a value in the range -r to +r. | |||
func decodeRange(X uint32, r float64) float64 { | |||
p := float64(X) / exp232 | |||
x := 2*r*p - r | |||
return x | |||
} | |||
// Squash the even bitlevels of X into a 32-bit word. Odd bitlevels of X are | |||
// ignored, and may take any value. | |||
func squash(X uint64) uint32 { | |||
X &= 0x5555555555555555 | |||
X = (X | (X >> 1)) & 0x3333333333333333 | |||
X = (X | (X >> 2)) & 0x0f0f0f0f0f0f0f0f | |||
X = (X | (X >> 4)) & 0x00ff00ff00ff00ff | |||
X = (X | (X >> 8)) & 0x0000ffff0000ffff | |||
X = (X | (X >> 16)) & 0x00000000ffffffff | |||
return uint32(X) | |||
} | |||
// Deinterleave the bits of X into 32-bit words containing the even and odd | |||
// bitlevels of X, respectively. | |||
func deinterleave(X uint64) (uint32, uint32) { | |||
return squash(X), squash(X >> 1) | |||
} | |||
// BoundingBoxIntWithPrecision returns the region encoded by the integer | |||
// geohash with the specified precision. | |||
func geoBoundingBoxIntWithPrecision(hash uint64, bits uint) geoBox { | |||
fullHash := hash << (64 - bits) | |||
latInt, lngInt := deinterleave(fullHash) | |||
lat := decodeRange(latInt, 90) | |||
lng := decodeRange(lngInt, 180) | |||
latErr, lngErr := errorWithPrecision(bits) | |||
return geoBox{ | |||
minLat: lat, | |||
maxLat: lat + latErr, | |||
minLng: lng, | |||
maxLng: lng + lngErr, | |||
} | |||
} | |||
// ---------------------------------------------------------------------- | |||
// Decode the string geohash to a (lat, lng) point. | |||
func GeoHashDecode(hash string) (lat, lng float64) { | |||
box := geoBoundingBox(hash) | |||
return box.round() | |||
} |
@@ -16,6 +16,7 @@ package geo | |||
import ( | |||
"reflect" | |||
"strconv" | |||
"strings" | |||
) | |||
@@ -24,6 +25,8 @@ import ( | |||
// Container: | |||
// slice length 2 (GeoJSON) | |||
// first element lon, second element lat | |||
// string (coordinates separated by comma, or a geohash) | |||
// first element lat, second element lon | |||
// map[string]interface{} | |||
// exact keys lat and lon or lng | |||
// struct | |||
@@ -36,10 +39,14 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) { | |||
var foundLon, foundLat bool | |||
thingVal := reflect.ValueOf(thing) | |||
if !thingVal.IsValid() { | |||
return lon, lat, false | |||
} | |||
thingTyp := thingVal.Type() | |||
// is it a slice | |||
if thingVal.IsValid() && thingVal.Kind() == reflect.Slice { | |||
if thingVal.Kind() == reflect.Slice { | |||
// must be length 2 | |||
if thingVal.Len() == 2 { | |||
first := thingVal.Index(0) | |||
@@ -55,6 +62,35 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) { | |||
} | |||
} | |||
// is it a string | |||
if thingVal.Kind() == reflect.String { | |||
geoStr := thingVal.Interface().(string) | |||
if strings.Contains(geoStr, ",") { | |||
// geo point with coordinates split by comma | |||
points := strings.Split(geoStr, ",") | |||
for i, point := range points { | |||
// trim any leading or trailing white spaces | |||
points[i] = strings.TrimSpace(point) | |||
} | |||
if len(points) == 2 { | |||
var err error | |||
lat, err = strconv.ParseFloat(points[0], 64) | |||
if err == nil { | |||
foundLat = true | |||
} | |||
lon, err = strconv.ParseFloat(points[1], 64) | |||
if err == nil { | |||
foundLon = true | |||
} | |||
} | |||
} else { | |||
// geohash | |||
lat, lon = GeoHashDecode(geoStr) | |||
foundLat = true | |||
foundLon = true | |||
} | |||
} | |||
// is it a map | |||
if l, ok := thing.(map[string]interface{}); ok { | |||
if lval, ok := l["lon"]; ok { | |||
@@ -68,7 +104,7 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) { | |||
} | |||
// now try reflection on struct fields | |||
if thingVal.IsValid() && thingVal.Kind() == reflect.Struct { | |||
if thingVal.Kind() == reflect.Struct { | |||
for i := 0; i < thingVal.NumField(); i++ { | |||
fieldName := thingTyp.Field(i).Name | |||
if strings.HasPrefix(strings.ToLower(fieldName), "lon") { | |||
@@ -113,6 +149,9 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) { | |||
// extract numeric value (if possible) and returns a float64 | |||
func extractNumericVal(v interface{}) (float64, bool) { | |||
val := reflect.ValueOf(v) | |||
if !val.IsValid() { | |||
return 0, false | |||
} | |||
typ := val.Type() | |||
switch typ.Kind() { | |||
case reflect.Float32, reflect.Float64: |
@@ -21,6 +21,7 @@ import ( | |||
"github.com/blevesearch/bleve/index" | |||
"github.com/blevesearch/bleve/index/store" | |||
"github.com/blevesearch/bleve/mapping" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
// A Batch groups together multiple Index and Delete | |||
@@ -32,6 +33,9 @@ import ( | |||
type Batch struct { | |||
index Index | |||
internal *index.Batch | |||
lastDocSize uint64 | |||
totalSize uint64 | |||
} | |||
// Index adds the specified index operation to the | |||
@@ -47,9 +51,22 @@ func (b *Batch) Index(id string, data interface{}) error { | |||
return err | |||
} | |||
b.internal.Update(doc) | |||
b.lastDocSize = uint64(doc.Size() + | |||
len(id) + size.SizeOfString) // overhead from internal | |||
b.totalSize += b.lastDocSize | |||
return nil | |||
} | |||
func (b *Batch) LastDocSize() uint64 { | |||
return b.lastDocSize | |||
} | |||
func (b *Batch) TotalDocsSize() uint64 { | |||
return b.totalSize | |||
} | |||
// IndexAdvanced adds the specified index operation to the | |||
// batch which skips the mapping. NOTE: the bleve Index is not updated | |||
// until the batch is executed. | |||
@@ -102,6 +119,24 @@ func (b *Batch) Reset() { | |||
b.internal.Reset() | |||
} | |||
func (b *Batch) Merge(o *Batch) { | |||
if o != nil && o.internal != nil { | |||
b.internal.Merge(o.internal) | |||
if o.LastDocSize() > 0 { | |||
b.lastDocSize = o.LastDocSize() | |||
} | |||
b.totalSize = uint64(b.internal.TotalDocSize()) | |||
} | |||
} | |||
func (b *Batch) SetPersistedCallback(f index.BatchCallback) { | |||
b.internal.SetPersistedCallback(f) | |||
} | |||
func (b *Batch) PersistedCallback() index.BatchCallback { | |||
return b.internal.PersistedCallback() | |||
} | |||
// An Index implements all the indexing and searching | |||
// capabilities of bleve. An Index can be created | |||
// using the New() and Open() methods. |
@@ -15,10 +15,20 @@ | |||
package index | |||
import ( | |||
"reflect" | |||
"github.com/blevesearch/bleve/analysis" | |||
"github.com/blevesearch/bleve/document" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var reflectStaticSizeAnalysisResult int | |||
func init() { | |||
var ar AnalysisResult | |||
reflectStaticSizeAnalysisResult = int(reflect.TypeOf(ar).Size()) | |||
} | |||
type IndexRow interface { | |||
KeySize() int | |||
KeyTo([]byte) (int, error) | |||
@@ -39,6 +49,15 @@ type AnalysisResult struct { | |||
Length []int | |||
} | |||
func (a *AnalysisResult) Size() int { | |||
rv := reflectStaticSizeAnalysisResult | |||
for _, analyzedI := range a.Analyzed { | |||
rv += analyzedI.Size() | |||
} | |||
rv += len(a.Length) * size.SizeOfInt | |||
return rv | |||
} | |||
type AnalysisWork struct { | |||
i Index | |||
d *document.Document |
@@ -18,11 +18,23 @@ import ( | |||
"bytes" | |||
"encoding/json" | |||
"fmt" | |||
"reflect" | |||
"github.com/blevesearch/bleve/document" | |||
"github.com/blevesearch/bleve/index/store" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var reflectStaticSizeTermFieldDoc int | |||
var reflectStaticSizeTermFieldVector int | |||
func init() { | |||
var tfd TermFieldDoc | |||
reflectStaticSizeTermFieldDoc = int(reflect.TypeOf(tfd).Size()) | |||
var tfv TermFieldVector | |||
reflectStaticSizeTermFieldVector = int(reflect.TypeOf(tfv).Size()) | |||
} | |||
var ErrorUnknownStorageType = fmt.Errorf("unknown storage type") | |||
type Index interface { | |||
@@ -68,6 +80,8 @@ type IndexReader interface { | |||
Document(id string) (*document.Document, error) | |||
DocumentVisitFieldTerms(id IndexInternalID, fields []string, visitor DocumentFieldTermVisitor) error | |||
DocValueReader(fields []string) (DocValueReader, error) | |||
Fields() ([]string, error) | |||
GetInternal(key []byte) ([]byte, error) | |||
@@ -84,6 +98,29 @@ type IndexReader interface { | |||
Close() error | |||
} | |||
// The Regexp interface defines the subset of the regexp.Regexp API
// methods that are used by bleve indexes, allowing callers to pass in
// alternate implementations.
type Regexp interface {
	FindStringIndex(s string) (loc []int)

	LiteralPrefix() (prefix string, complete bool)

	String() string
}

// IndexReaderRegexp is an optional capability an IndexReader may
// implement, enumerating a field's dictionary entries matching the
// supplied regular expression string.
type IndexReaderRegexp interface {
	FieldDictRegexp(field string, regex string) (FieldDict, error)
}

// IndexReaderFuzzy is an optional capability an IndexReader may
// implement, enumerating a field's dictionary entries within the given
// edit-distance (fuzziness) of term, optionally constrained by prefix.
type IndexReaderFuzzy interface {
	FieldDictFuzzy(field string, term string, fuzziness int, prefix string) (FieldDict, error)
}

// IndexReaderOnly is an optional capability an IndexReader may
// implement, enumerating only the provided terms of a field's
// dictionary; includeCount presumably requests per-term counts —
// verify against implementations.
type IndexReaderOnly interface {
	FieldDictOnly(field string, onlyTerms [][]byte, includeCount bool) (FieldDict, error)
}
// FieldTerms contains the terms used by a document, keyed by field | |||
type FieldTerms map[string][]string | |||
@@ -115,6 +152,11 @@ type TermFieldVector struct { | |||
End uint64 | |||
} | |||
func (tfv *TermFieldVector) Size() int { | |||
return reflectStaticSizeTermFieldVector + size.SizeOfPtr + | |||
len(tfv.Field) + len(tfv.ArrayPositions)*size.SizeOfUint64 | |||
} | |||
// IndexInternalID is an opaque document identifier interal to the index impl | |||
type IndexInternalID []byte | |||
@@ -134,14 +176,27 @@ type TermFieldDoc struct { | |||
Vectors []*TermFieldVector | |||
} | |||
func (tfd *TermFieldDoc) Size() int { | |||
sizeInBytes := reflectStaticSizeTermFieldDoc + size.SizeOfPtr + | |||
len(tfd.Term) + len(tfd.ID) | |||
for _, entry := range tfd.Vectors { | |||
sizeInBytes += entry.Size() | |||
} | |||
return sizeInBytes | |||
} | |||
// Reset allows an already allocated TermFieldDoc to be reused | |||
func (tfd *TermFieldDoc) Reset() *TermFieldDoc { | |||
// remember the []byte used for the ID | |||
id := tfd.ID | |||
vectors := tfd.Vectors | |||
// idiom to copy over from empty TermFieldDoc (0 allocations) | |||
*tfd = TermFieldDoc{} | |||
// reuse the []byte already allocated (and reset len to 0) | |||
tfd.ID = id[:0] | |||
tfd.Vectors = vectors[:0] | |||
return tfd | |||
} | |||
@@ -161,6 +216,8 @@ type TermFieldReader interface { | |||
// Count returns the number of documents contains the term in this field. | |||
Count() uint64 | |||
Close() error | |||
Size() int | |||
} | |||
type DictEntry struct { | |||
@@ -185,12 +242,18 @@ type DocIDReader interface { | |||
// will start there instead. If ID is greater than or equal to the end of | |||
// the range, Next() call will return io.EOF. | |||
Advance(ID IndexInternalID) (IndexInternalID, error) | |||
Size() int | |||
Close() error | |||
} | |||
type BatchCallback func(error) | |||
type Batch struct { | |||
IndexOps map[string]*document.Document | |||
InternalOps map[string][]byte | |||
IndexOps map[string]*document.Document | |||
InternalOps map[string][]byte | |||
persistedCallback BatchCallback | |||
} | |||
func NewBatch() *Batch { | |||
@@ -216,6 +279,14 @@ func (b *Batch) DeleteInternal(key []byte) { | |||
b.InternalOps[string(key)] = nil | |||
} | |||
// SetPersistedCallback registers a callback to be invoked when this
// batch has been persisted.
func (b *Batch) SetPersistedCallback(f BatchCallback) {
	b.persistedCallback = f
}
// PersistedCallback returns the callback previously registered via
// SetPersistedCallback (nil if none was set).
func (b *Batch) PersistedCallback() BatchCallback {
	return b.persistedCallback
}
func (b *Batch) String() string { | |||
rv := fmt.Sprintf("Batch (%d ops, %d internal ops)\n", len(b.IndexOps), len(b.InternalOps)) | |||
for k, v := range b.IndexOps { | |||
@@ -238,4 +309,53 @@ func (b *Batch) String() string { | |||
func (b *Batch) Reset() { | |||
b.IndexOps = make(map[string]*document.Document) | |||
b.InternalOps = make(map[string][]byte) | |||
b.persistedCallback = nil | |||
} | |||
func (b *Batch) Merge(o *Batch) { | |||
for k, v := range o.IndexOps { | |||
b.IndexOps[k] = v | |||
} | |||
for k, v := range o.InternalOps { | |||
b.InternalOps[k] = v | |||
} | |||
} | |||
func (b *Batch) TotalDocSize() int { | |||
var s int | |||
for k, v := range b.IndexOps { | |||
if v != nil { | |||
s += v.Size() + size.SizeOfString | |||
} | |||
s += len(k) | |||
} | |||
return s | |||
} | |||
// Optimizable represents an optional interface that is implementable
// by optimizable resources (e.g., TermFieldReaders, Searchers). These
// optimizable resources are provided the same OptimizableContext
// instance, so that they can coordinate via dynamic interface
// casting.
type Optimizable interface {
	Optimize(kind string, octx OptimizableContext) (OptimizableContext, error)
}

// Optimized represents a result of optimization -- see the Finish()
// method.
type Optimized interface{}

// OptimizableContext is the shared coordination point handed to every
// Optimizable resource participating in one optimization.
type OptimizableContext interface {
	// Once all the optimizable resources have been provided the same
	// OptimizableContext instance, the optimization preparations are
	// finished or completed via the Finish() method.
	//
	// Depending on the optimization being performed, the Finish()
	// method might return a non-nil Optimized instance. For example,
	// the Optimized instance might represent an optimized
	// TermFieldReader instance.
	Finish() (Optimized, error)
}

// DocValueReader visits the stored doc-values of the requested fields
// for a single document identified by its internal ID.
type DocValueReader interface {
	VisitDocValues(id IndexInternalID, visitor DocumentFieldTermVisitor) error
}
@@ -19,7 +19,9 @@ import ( | |||
"sync/atomic" | |||
"github.com/RoaringBitmap/roaring" | |||
"github.com/blevesearch/bleve/index" | |||
"github.com/blevesearch/bleve/index/scorch/segment" | |||
"github.com/blevesearch/bleve/index/scorch/segment/zap" | |||
) | |||
type segmentIntroduction struct { | |||
@@ -29,8 +31,14 @@ type segmentIntroduction struct { | |||
ids []string | |||
internal map[string][]byte | |||
applied chan error | |||
persisted chan error | |||
applied chan error | |||
persisted chan error | |||
persistedCallback index.BatchCallback | |||
} | |||
type persistIntroduction struct { | |||
persisted map[uint64]segment.Segment | |||
applied notificationChan | |||
} | |||
type epochWatcher struct { | |||
@@ -48,6 +56,8 @@ func (s *Scorch) mainLoop() { | |||
var epochWatchers []*epochWatcher | |||
OUTER: | |||
for { | |||
atomic.AddUint64(&s.stats.TotIntroduceLoop, 1) | |||
select { | |||
case <-s.closeCh: | |||
break OUTER | |||
@@ -64,6 +74,9 @@ OUTER: | |||
continue OUTER | |||
} | |||
case persist := <-s.persists: | |||
s.introducePersist(persist) | |||
case revertTo := <-s.revertToSnapshots: | |||
err := s.revertToSnapshot(revertTo) | |||
if err != nil { | |||
@@ -92,32 +105,38 @@ OUTER: | |||
} | |||
func (s *Scorch) introduceSegment(next *segmentIntroduction) error { | |||
// acquire lock | |||
s.rootLock.Lock() | |||
atomic.AddUint64(&s.stats.TotIntroduceSegmentBeg, 1) | |||
defer atomic.AddUint64(&s.stats.TotIntroduceSegmentEnd, 1) | |||
s.rootLock.RLock() | |||
root := s.root | |||
root.AddRef() | |||
s.rootLock.RUnlock() | |||
defer func() { _ = root.DecRef() }() | |||
nsegs := len(s.root.segment) | |||
nsegs := len(root.segment) | |||
// prepare new index snapshot | |||
newSnapshot := &IndexSnapshot{ | |||
parent: s, | |||
segment: make([]*SegmentSnapshot, 0, nsegs+1), | |||
offsets: make([]uint64, 0, nsegs+1), | |||
internal: make(map[string][]byte, len(s.root.internal)), | |||
epoch: s.nextSnapshotEpoch, | |||
internal: make(map[string][]byte, len(root.internal)), | |||
refs: 1, | |||
creator: "introduceSegment", | |||
} | |||
s.nextSnapshotEpoch++ | |||
// iterate through current segments | |||
var running uint64 | |||
for i := range s.root.segment { | |||
var docsToPersistCount, memSegments, fileSegments uint64 | |||
for i := range root.segment { | |||
// see if optimistic work included this segment | |||
delta, ok := next.obsoletes[s.root.segment[i].id] | |||
delta, ok := next.obsoletes[root.segment[i].id] | |||
if !ok { | |||
var err error | |||
delta, err = s.root.segment[i].segment.DocNumbers(next.ids) | |||
delta, err = root.segment[i].segment.DocNumbers(next.ids) | |||
if err != nil { | |||
s.rootLock.Unlock() | |||
next.applied <- fmt.Errorf("error computing doc numbers: %v", err) | |||
close(next.applied) | |||
_ = newSnapshot.DecRef() | |||
@@ -126,43 +145,60 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { | |||
} | |||
newss := &SegmentSnapshot{ | |||
id: s.root.segment[i].id, | |||
segment: s.root.segment[i].segment, | |||
cachedDocs: s.root.segment[i].cachedDocs, | |||
id: root.segment[i].id, | |||
segment: root.segment[i].segment, | |||
cachedDocs: root.segment[i].cachedDocs, | |||
creator: root.segment[i].creator, | |||
} | |||
// apply new obsoletions | |||
if s.root.segment[i].deleted == nil { | |||
if root.segment[i].deleted == nil { | |||
newss.deleted = delta | |||
} else { | |||
newss.deleted = roaring.Or(s.root.segment[i].deleted, delta) | |||
newss.deleted = roaring.Or(root.segment[i].deleted, delta) | |||
} | |||
if newss.deleted.IsEmpty() { | |||
newss.deleted = nil | |||
} | |||
// check for live size before copying | |||
if newss.LiveSize() > 0 { | |||
newSnapshot.segment = append(newSnapshot.segment, newss) | |||
s.root.segment[i].segment.AddRef() | |||
root.segment[i].segment.AddRef() | |||
newSnapshot.offsets = append(newSnapshot.offsets, running) | |||
running += s.root.segment[i].Count() | |||
running += newss.segment.Count() | |||
} | |||
if isMemorySegment(root.segment[i]) { | |||
docsToPersistCount += root.segment[i].Count() | |||
memSegments++ | |||
} else { | |||
fileSegments++ | |||
} | |||
} | |||
atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount) | |||
atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments) | |||
atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments) | |||
// append new segment, if any, to end of the new index snapshot | |||
if next.data != nil { | |||
newSegmentSnapshot := &SegmentSnapshot{ | |||
id: next.id, | |||
segment: next.data, // take ownership of next.data's ref-count | |||
cachedDocs: &cachedDocs{cache: nil}, | |||
creator: "introduceSegment", | |||
} | |||
newSnapshot.segment = append(newSnapshot.segment, newSegmentSnapshot) | |||
newSnapshot.offsets = append(newSnapshot.offsets, running) | |||
// increment numItemsIntroduced which tracks the number of items | |||
// queued for persistence. | |||
atomic.AddUint64(&s.stats.numItemsIntroduced, newSegmentSnapshot.Count()) | |||
atomic.AddUint64(&s.stats.TotIntroducedItems, newSegmentSnapshot.Count()) | |||
atomic.AddUint64(&s.stats.TotIntroducedSegmentsBatch, 1) | |||
} | |||
// copy old values | |||
for key, oldVal := range s.root.internal { | |||
for key, oldVal := range root.internal { | |||
newSnapshot.internal[key] = oldVal | |||
} | |||
// set new values and apply deletes | |||
@@ -173,12 +209,21 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { | |||
delete(newSnapshot.internal, key) | |||
} | |||
} | |||
newSnapshot.updateSize() | |||
s.rootLock.Lock() | |||
if next.persisted != nil { | |||
s.rootPersisted = append(s.rootPersisted, next.persisted) | |||
} | |||
if next.persistedCallback != nil { | |||
s.persistedCallbacks = append(s.persistedCallbacks, next.persistedCallback) | |||
} | |||
// swap in new index snapshot | |||
newSnapshot.epoch = s.nextSnapshotEpoch | |||
s.nextSnapshotEpoch++ | |||
rootPrev := s.root | |||
s.root = newSnapshot | |||
atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch) | |||
// release lock | |||
s.rootLock.Unlock() | |||
@@ -191,42 +236,113 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { | |||
return nil | |||
} | |||
func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { | |||
// acquire lock | |||
func (s *Scorch) introducePersist(persist *persistIntroduction) { | |||
atomic.AddUint64(&s.stats.TotIntroducePersistBeg, 1) | |||
defer atomic.AddUint64(&s.stats.TotIntroducePersistEnd, 1) | |||
s.rootLock.Lock() | |||
root := s.root | |||
root.AddRef() | |||
nextSnapshotEpoch := s.nextSnapshotEpoch | |||
s.nextSnapshotEpoch++ | |||
s.rootLock.Unlock() | |||
// prepare new index snapshot | |||
currSize := len(s.root.segment) | |||
newSize := currSize + 1 - len(nextMerge.old) | |||
defer func() { _ = root.DecRef() }() | |||
newIndexSnapshot := &IndexSnapshot{ | |||
parent: s, | |||
epoch: nextSnapshotEpoch, | |||
segment: make([]*SegmentSnapshot, len(root.segment)), | |||
offsets: make([]uint64, len(root.offsets)), | |||
internal: make(map[string][]byte, len(root.internal)), | |||
refs: 1, | |||
creator: "introducePersist", | |||
} | |||
var docsToPersistCount, memSegments, fileSegments uint64 | |||
for i, segmentSnapshot := range root.segment { | |||
// see if this segment has been replaced | |||
if replacement, ok := persist.persisted[segmentSnapshot.id]; ok { | |||
newSegmentSnapshot := &SegmentSnapshot{ | |||
id: segmentSnapshot.id, | |||
segment: replacement, | |||
deleted: segmentSnapshot.deleted, | |||
cachedDocs: segmentSnapshot.cachedDocs, | |||
creator: "introducePersist", | |||
} | |||
newIndexSnapshot.segment[i] = newSegmentSnapshot | |||
delete(persist.persisted, segmentSnapshot.id) | |||
// update items persisted incase of a new segment snapshot | |||
atomic.AddUint64(&s.stats.TotPersistedItems, newSegmentSnapshot.Count()) | |||
atomic.AddUint64(&s.stats.TotPersistedSegments, 1) | |||
fileSegments++ | |||
} else { | |||
newIndexSnapshot.segment[i] = root.segment[i] | |||
newIndexSnapshot.segment[i].segment.AddRef() | |||
if isMemorySegment(root.segment[i]) { | |||
docsToPersistCount += root.segment[i].Count() | |||
memSegments++ | |||
} else { | |||
fileSegments++ | |||
} | |||
} | |||
newIndexSnapshot.offsets[i] = root.offsets[i] | |||
} | |||
for k, v := range root.internal { | |||
newIndexSnapshot.internal[k] = v | |||
} | |||
atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount) | |||
atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments) | |||
atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments) | |||
newIndexSnapshot.updateSize() | |||
s.rootLock.Lock() | |||
rootPrev := s.root | |||
s.root = newIndexSnapshot | |||
atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch) | |||
s.rootLock.Unlock() | |||
// empty segments deletion | |||
if nextMerge.new == nil { | |||
newSize-- | |||
if rootPrev != nil { | |||
_ = rootPrev.DecRef() | |||
} | |||
close(persist.applied) | |||
} | |||
func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { | |||
atomic.AddUint64(&s.stats.TotIntroduceMergeBeg, 1) | |||
defer atomic.AddUint64(&s.stats.TotIntroduceMergeEnd, 1) | |||
s.rootLock.RLock() | |||
root := s.root | |||
root.AddRef() | |||
s.rootLock.RUnlock() | |||
defer func() { _ = root.DecRef() }() | |||
newSnapshot := &IndexSnapshot{ | |||
parent: s, | |||
segment: make([]*SegmentSnapshot, 0, newSize), | |||
offsets: make([]uint64, 0, newSize), | |||
internal: s.root.internal, | |||
epoch: s.nextSnapshotEpoch, | |||
internal: root.internal, | |||
refs: 1, | |||
creator: "introduceMerge", | |||
} | |||
s.nextSnapshotEpoch++ | |||
// iterate through current segments | |||
newSegmentDeleted := roaring.NewBitmap() | |||
var running uint64 | |||
for i := range s.root.segment { | |||
segmentID := s.root.segment[i].id | |||
var running, docsToPersistCount, memSegments, fileSegments uint64 | |||
for i := range root.segment { | |||
segmentID := root.segment[i].id | |||
if segSnapAtMerge, ok := nextMerge.old[segmentID]; ok { | |||
// this segment is going away, see if anything else was deleted since we started the merge | |||
if segSnapAtMerge != nil && s.root.segment[i].deleted != nil { | |||
if segSnapAtMerge != nil && root.segment[i].deleted != nil { | |||
// assume all these deletes are new | |||
deletedSince := s.root.segment[i].deleted | |||
deletedSince := root.segment[i].deleted | |||
// if we already knew about some of them, remove | |||
if segSnapAtMerge.deleted != nil { | |||
deletedSince = roaring.AndNot(s.root.segment[i].deleted, segSnapAtMerge.deleted) | |||
deletedSince = roaring.AndNot(root.segment[i].deleted, segSnapAtMerge.deleted) | |||
} | |||
deletedSinceItr := deletedSince.Iterator() | |||
for deletedSinceItr.HasNext() { | |||
@@ -240,18 +356,25 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { | |||
// segments left behind in old map after processing | |||
// the root segments would be the obsolete segment set | |||
delete(nextMerge.old, segmentID) | |||
} else if s.root.segment[i].LiveSize() > 0 { | |||
} else if root.segment[i].LiveSize() > 0 { | |||
// this segment is staying | |||
newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{ | |||
id: s.root.segment[i].id, | |||
segment: s.root.segment[i].segment, | |||
deleted: s.root.segment[i].deleted, | |||
cachedDocs: s.root.segment[i].cachedDocs, | |||
id: root.segment[i].id, | |||
segment: root.segment[i].segment, | |||
deleted: root.segment[i].deleted, | |||
cachedDocs: root.segment[i].cachedDocs, | |||
creator: root.segment[i].creator, | |||
}) | |||
s.root.segment[i].segment.AddRef() | |||
root.segment[i].segment.AddRef() | |||
newSnapshot.offsets = append(newSnapshot.offsets, running) | |||
running += s.root.segment[i].Count() | |||
running += root.segment[i].segment.Count() | |||
if isMemorySegment(root.segment[i]) { | |||
docsToPersistCount += root.segment[i].Count() | |||
memSegments++ | |||
} else { | |||
fileSegments++ | |||
} | |||
} | |||
} | |||
@@ -269,6 +392,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { | |||
} | |||
} | |||
} | |||
// In case where all the docs in the newly merged segment getting | |||
// deleted by the time we reach here, can skip the introduction. | |||
if nextMerge.new != nil && | |||
@@ -279,15 +403,35 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { | |||
segment: nextMerge.new, // take ownership for nextMerge.new's ref-count | |||
deleted: newSegmentDeleted, | |||
cachedDocs: &cachedDocs{cache: nil}, | |||
creator: "introduceMerge", | |||
}) | |||
newSnapshot.offsets = append(newSnapshot.offsets, running) | |||
atomic.AddUint64(&s.stats.TotIntroducedSegmentsMerge, 1) | |||
switch nextMerge.new.(type) { | |||
case *zap.SegmentBase: | |||
docsToPersistCount += nextMerge.new.Count() - newSegmentDeleted.GetCardinality() | |||
memSegments++ | |||
case *zap.Segment: | |||
fileSegments++ | |||
} | |||
} | |||
atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount) | |||
atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments) | |||
atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments) | |||
newSnapshot.AddRef() // 1 ref for the nextMerge.notify response | |||
// swap in new segment | |||
newSnapshot.updateSize() | |||
s.rootLock.Lock() | |||
// swap in new index snapshot | |||
newSnapshot.epoch = s.nextSnapshotEpoch | |||
s.nextSnapshotEpoch++ | |||
rootPrev := s.root | |||
s.root = newSnapshot | |||
atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch) | |||
// release lock | |||
s.rootLock.Unlock() | |||
@@ -301,6 +445,9 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { | |||
} | |||
func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error { | |||
atomic.AddUint64(&s.stats.TotIntroduceRevertBeg, 1) | |||
defer atomic.AddUint64(&s.stats.TotIntroduceRevertEnd, 1) | |||
if revertTo.snapshot == nil { | |||
err := fmt.Errorf("Cannot revert to a nil snapshot") | |||
revertTo.applied <- err | |||
@@ -318,9 +465,11 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error { | |||
internal: revertTo.snapshot.internal, | |||
epoch: s.nextSnapshotEpoch, | |||
refs: 1, | |||
creator: "revertToSnapshot", | |||
} | |||
s.nextSnapshotEpoch++ | |||
var docsToPersistCount, memSegments, fileSegments uint64 | |||
// iterate through segments | |||
for i, segmentSnapshot := range revertTo.snapshot.segment { | |||
newSnapshot.segment[i] = &SegmentSnapshot{ | |||
@@ -328,21 +477,37 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error { | |||
segment: segmentSnapshot.segment, | |||
deleted: segmentSnapshot.deleted, | |||
cachedDocs: segmentSnapshot.cachedDocs, | |||
creator: segmentSnapshot.creator, | |||
} | |||
newSnapshot.segment[i].segment.AddRef() | |||
// remove segment from ineligibleForRemoval map | |||
filename := zapFileName(segmentSnapshot.id) | |||
delete(s.ineligibleForRemoval, filename) | |||
if isMemorySegment(segmentSnapshot) { | |||
docsToPersistCount += segmentSnapshot.Count() | |||
memSegments++ | |||
} else { | |||
fileSegments++ | |||
} | |||
} | |||
atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount) | |||
atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments) | |||
atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments) | |||
if revertTo.persisted != nil { | |||
s.rootPersisted = append(s.rootPersisted, revertTo.persisted) | |||
} | |||
newSnapshot.updateSize() | |||
// swap in new snapshot | |||
rootPrev := s.root | |||
s.root = newSnapshot | |||
atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch) | |||
// release lock | |||
s.rootLock.Unlock() | |||
@@ -354,3 +519,12 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error { | |||
return nil | |||
} | |||
func isMemorySegment(s *SegmentSnapshot) bool { | |||
switch s.segment.(type) { | |||
case *zap.SegmentBase: | |||
return true | |||
default: | |||
return false | |||
} | |||
} |
@@ -15,9 +15,7 @@ | |||
package scorch | |||
import ( | |||
"bytes" | |||
"encoding/json" | |||
"fmt" | |||
"os" | |||
"sync/atomic" | |||
@@ -40,16 +38,20 @@ func (s *Scorch) mergerLoop() { | |||
OUTER: | |||
for { | |||
atomic.AddUint64(&s.stats.TotFileMergeLoopBeg, 1) | |||
select { | |||
case <-s.closeCh: | |||
break OUTER | |||
default: | |||
// check to see if there is a new snapshot to persist | |||
s.rootLock.RLock() | |||
s.rootLock.Lock() | |||
ourSnapshot := s.root | |||
ourSnapshot.AddRef() | |||
s.rootLock.RUnlock() | |||
atomic.StoreUint64(&s.iStats.mergeSnapshotSize, uint64(ourSnapshot.Size())) | |||
atomic.StoreUint64(&s.iStats.mergeEpoch, ourSnapshot.epoch) | |||
s.rootLock.Unlock() | |||
if ourSnapshot.epoch != lastEpochMergePlanned { | |||
startTime := time.Now() | |||
@@ -57,12 +59,21 @@ OUTER: | |||
// lets get started | |||
err := s.planMergeAtSnapshot(ourSnapshot, mergePlannerOptions) | |||
if err != nil { | |||
atomic.StoreUint64(&s.iStats.mergeEpoch, 0) | |||
if err == segment.ErrClosed { | |||
// index has been closed | |||
_ = ourSnapshot.DecRef() | |||
break OUTER | |||
} | |||
s.fireAsyncError(fmt.Errorf("merging err: %v", err)) | |||
_ = ourSnapshot.DecRef() | |||
atomic.AddUint64(&s.stats.TotFileMergeLoopErr, 1) | |||
continue OUTER | |||
} | |||
lastEpochMergePlanned = ourSnapshot.epoch | |||
atomic.StoreUint64(&s.stats.LastMergedEpoch, ourSnapshot.epoch) | |||
s.fireEvent(EventKindMergerProgress, time.Since(startTime)) | |||
} | |||
_ = ourSnapshot.DecRef() | |||
@@ -88,7 +99,10 @@ OUTER: | |||
case <-ew.notifyCh: | |||
} | |||
} | |||
atomic.AddUint64(&s.stats.TotFileMergeLoopEnd, 1) | |||
} | |||
s.asyncTasks.Done() | |||
} | |||
@@ -105,6 +119,11 @@ func (s *Scorch) parseMergePlannerOptions() (*mergeplan.MergePlanOptions, | |||
if err != nil { | |||
return &mergePlannerOptions, err | |||
} | |||
err = mergeplan.ValidateMergePlannerOptions(&mergePlannerOptions) | |||
if err != nil { | |||
return nil, err | |||
} | |||
} | |||
return &mergePlannerOptions, nil | |||
} | |||
@@ -119,32 +138,45 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, | |||
} | |||
} | |||
atomic.AddUint64(&s.stats.TotFileMergePlan, 1) | |||
// give this list to the planner | |||
resultMergePlan, err := mergeplan.Plan(onlyZapSnapshots, options) | |||
if err != nil { | |||
atomic.AddUint64(&s.stats.TotFileMergePlanErr, 1) | |||
return fmt.Errorf("merge planning err: %v", err) | |||
} | |||
if resultMergePlan == nil { | |||
// nothing to do | |||
atomic.AddUint64(&s.stats.TotFileMergePlanNone, 1) | |||
return nil | |||
} | |||
atomic.AddUint64(&s.stats.TotFileMergePlanOk, 1) | |||
atomic.AddUint64(&s.stats.TotFileMergePlanTasks, uint64(len(resultMergePlan.Tasks))) | |||
// process tasks in serial for now | |||
var notifications []chan *IndexSnapshot | |||
for _, task := range resultMergePlan.Tasks { | |||
if len(task.Segments) == 0 { | |||
atomic.AddUint64(&s.stats.TotFileMergePlanTasksSegmentsEmpty, 1) | |||
continue | |||
} | |||
atomic.AddUint64(&s.stats.TotFileMergePlanTasksSegments, uint64(len(task.Segments))) | |||
oldMap := make(map[uint64]*SegmentSnapshot) | |||
newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1) | |||
segmentsToMerge := make([]*zap.Segment, 0, len(task.Segments)) | |||
docsToDrop := make([]*roaring.Bitmap, 0, len(task.Segments)) | |||
for _, planSegment := range task.Segments { | |||
if segSnapshot, ok := planSegment.(*SegmentSnapshot); ok { | |||
oldMap[segSnapshot.id] = segSnapshot | |||
if zapSeg, ok := segSnapshot.segment.(*zap.Segment); ok { | |||
if segSnapshot.LiveSize() == 0 { | |||
atomic.AddUint64(&s.stats.TotFileMergeSegmentsEmpty, 1) | |||
oldMap[segSnapshot.id] = nil | |||
} else { | |||
segmentsToMerge = append(segmentsToMerge, zapSeg) | |||
@@ -155,32 +187,53 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, | |||
} | |||
var oldNewDocNums map[uint64][]uint64 | |||
var segment segment.Segment | |||
var seg segment.Segment | |||
if len(segmentsToMerge) > 0 { | |||
filename := zapFileName(newSegmentID) | |||
s.markIneligibleForRemoval(filename) | |||
path := s.path + string(os.PathSeparator) + filename | |||
newDocNums, err := zap.Merge(segmentsToMerge, docsToDrop, path, 1024) | |||
fileMergeZapStartTime := time.Now() | |||
atomic.AddUint64(&s.stats.TotFileMergeZapBeg, 1) | |||
newDocNums, _, err := zap.Merge(segmentsToMerge, docsToDrop, path, | |||
DefaultChunkFactor, s.closeCh, s) | |||
atomic.AddUint64(&s.stats.TotFileMergeZapEnd, 1) | |||
fileMergeZapTime := uint64(time.Since(fileMergeZapStartTime)) | |||
atomic.AddUint64(&s.stats.TotFileMergeZapTime, fileMergeZapTime) | |||
if atomic.LoadUint64(&s.stats.MaxFileMergeZapTime) < fileMergeZapTime { | |||
atomic.StoreUint64(&s.stats.MaxFileMergeZapTime, fileMergeZapTime) | |||
} | |||
if err != nil { | |||
s.unmarkIneligibleForRemoval(filename) | |||
atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1) | |||
if err == segment.ErrClosed { | |||
return err | |||
} | |||
return fmt.Errorf("merging failed: %v", err) | |||
} | |||
segment, err = zap.Open(path) | |||
seg, err = zap.Open(path) | |||
if err != nil { | |||
s.unmarkIneligibleForRemoval(filename) | |||
atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1) | |||
return err | |||
} | |||
oldNewDocNums = make(map[uint64][]uint64) | |||
for i, segNewDocNums := range newDocNums { | |||
oldNewDocNums[task.Segments[i].Id()] = segNewDocNums | |||
} | |||
atomic.AddUint64(&s.stats.TotFileMergeSegments, uint64(len(segmentsToMerge))) | |||
} | |||
sm := &segmentMerge{ | |||
id: newSegmentID, | |||
old: oldMap, | |||
oldNewDocNums: oldNewDocNums, | |||
new: segment, | |||
new: seg, | |||
notify: make(chan *IndexSnapshot, 1), | |||
} | |||
notifications = append(notifications, sm.notify) | |||
@@ -188,21 +241,28 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, | |||
// give it to the introducer | |||
select { | |||
case <-s.closeCh: | |||
_ = segment.Close() | |||
return nil | |||
_ = seg.Close() | |||
return segment.ErrClosed | |||
case s.merges <- sm: | |||
atomic.AddUint64(&s.stats.TotFileMergeIntroductions, 1) | |||
} | |||
atomic.AddUint64(&s.stats.TotFileMergePlanTasksDone, 1) | |||
} | |||
for _, notification := range notifications { | |||
select { | |||
case <-s.closeCh: | |||
return nil | |||
atomic.AddUint64(&s.stats.TotFileMergeIntroductionsSkipped, 1) | |||
return segment.ErrClosed | |||
case newSnapshot := <-notification: | |||
atomic.AddUint64(&s.stats.TotFileMergeIntroductionsDone, 1) | |||
if newSnapshot != nil { | |||
_ = newSnapshot.DecRef() | |||
} | |||
} | |||
} | |||
return nil | |||
} | |||
@@ -219,44 +279,48 @@ type segmentMerge struct { | |||
// into the root | |||
func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, | |||
sbs []*zap.SegmentBase, sbsDrops []*roaring.Bitmap, sbsIndexes []int, | |||
chunkFactor uint32) (uint64, *IndexSnapshot, uint64, error) { | |||
var br bytes.Buffer | |||
chunkFactor uint32) (*IndexSnapshot, uint64, error) { | |||
atomic.AddUint64(&s.stats.TotMemMergeBeg, 1) | |||
cr := zap.NewCountHashWriter(&br) | |||
memMergeZapStartTime := time.Now() | |||
newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, | |||
docValueOffset, dictLocs, fieldsInv, fieldsMap, err := | |||
zap.MergeToWriter(sbs, sbsDrops, chunkFactor, cr) | |||
if err != nil { | |||
return 0, nil, 0, err | |||
} | |||
sb, err := zap.InitSegmentBase(br.Bytes(), cr.Sum32(), chunkFactor, | |||
fieldsMap, fieldsInv, numDocs, storedIndexOffset, fieldsIndexOffset, | |||
docValueOffset, dictLocs) | |||
if err != nil { | |||
return 0, nil, 0, err | |||
} | |||
atomic.AddUint64(&s.stats.TotMemMergeZapBeg, 1) | |||
newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1) | |||
filename := zapFileName(newSegmentID) | |||
path := s.path + string(os.PathSeparator) + filename | |||
err = zap.PersistSegmentBase(sb, path) | |||
newDocNums, _, err := | |||
zap.MergeSegmentBases(sbs, sbsDrops, path, chunkFactor, s.closeCh, s) | |||
atomic.AddUint64(&s.stats.TotMemMergeZapEnd, 1) | |||
memMergeZapTime := uint64(time.Since(memMergeZapStartTime)) | |||
atomic.AddUint64(&s.stats.TotMemMergeZapTime, memMergeZapTime) | |||
if atomic.LoadUint64(&s.stats.MaxMemMergeZapTime) < memMergeZapTime { | |||
atomic.StoreUint64(&s.stats.MaxMemMergeZapTime, memMergeZapTime) | |||
} | |||
if err != nil { | |||
return 0, nil, 0, err | |||
atomic.AddUint64(&s.stats.TotMemMergeErr, 1) | |||
return nil, 0, err | |||
} | |||
segment, err := zap.Open(path) | |||
seg, err := zap.Open(path) | |||
if err != nil { | |||
return 0, nil, 0, err | |||
atomic.AddUint64(&s.stats.TotMemMergeErr, 1) | |||
return nil, 0, err | |||
} | |||
// update persisted stats | |||
atomic.AddUint64(&s.stats.TotPersistedItems, seg.Count()) | |||
atomic.AddUint64(&s.stats.TotPersistedSegments, 1) | |||
sm := &segmentMerge{ | |||
id: newSegmentID, | |||
old: make(map[uint64]*SegmentSnapshot), | |||
oldNewDocNums: make(map[uint64][]uint64), | |||
new: segment, | |||
new: seg, | |||
notify: make(chan *IndexSnapshot, 1), | |||
} | |||
@@ -268,15 +332,21 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, | |||
select { // send to introducer | |||
case <-s.closeCh: | |||
_ = segment.DecRef() | |||
return 0, nil, 0, nil // TODO: return ErrInterruptedClosed? | |||
_ = seg.DecRef() | |||
return nil, 0, segment.ErrClosed | |||
case s.merges <- sm: | |||
} | |||
select { // wait for introduction to complete | |||
case <-s.closeCh: | |||
return 0, nil, 0, nil // TODO: return ErrInterruptedClosed? | |||
return nil, 0, segment.ErrClosed | |||
case newSnapshot := <-sm.notify: | |||
return numDocs, newSnapshot, newSegmentID, nil | |||
atomic.AddUint64(&s.stats.TotMemMergeSegments, uint64(len(sbs))) | |||
atomic.AddUint64(&s.stats.TotMemMergeDone, 1) | |||
return newSnapshot, newSegmentID, nil | |||
} | |||
} | |||
func (s *Scorch) ReportBytesWritten(bytesWritten uint64) { | |||
atomic.AddUint64(&s.stats.TotFileMergeWrittenBytes, bytesWritten) | |||
} |
@@ -18,6 +18,7 @@ | |||
package mergeplan | |||
import ( | |||
"errors" | |||
"fmt" | |||
"math" | |||
"sort" | |||
@@ -115,7 +116,15 @@ func (o *MergePlanOptions) RaiseToFloorSegmentSize(s int64) int64 { | |||
return o.FloorSegmentSize | |||
} | |||
// Suggested default options. | |||
// MaxSegmentSizeLimit represents the maximum size of a segment, | |||
// this limit comes with hit-1 optimisation/max encoding limit uint31. | |||
const MaxSegmentSizeLimit = 1<<31 - 1 | |||
// ErrMaxSegmentSizeTooLarge is returned when the size of the segment | |||
// exceeds the MaxSegmentSizeLimit | |||
var ErrMaxSegmentSizeTooLarge = errors.New("MaxSegmentSize exceeds the size limit") | |||
// DefaultMergePlanOptions suggests the default options. | |||
var DefaultMergePlanOptions = MergePlanOptions{ | |||
MaxSegmentsPerTier: 10, | |||
MaxSegmentSize: 5000000, | |||
@@ -208,14 +217,14 @@ func plan(segmentsIn []Segment, o *MergePlanOptions) (*MergePlan, error) { | |||
if len(roster) > 0 { | |||
rosterScore := scoreSegments(roster, o) | |||
if len(bestRoster) <= 0 || rosterScore < bestRosterScore { | |||
if len(bestRoster) == 0 || rosterScore < bestRosterScore { | |||
bestRoster = roster | |||
bestRosterScore = rosterScore | |||
} | |||
} | |||
} | |||
if len(bestRoster) <= 0 { | |||
if len(bestRoster) == 0 { | |||
return rv, nil | |||
} | |||
@@ -367,3 +376,11 @@ func ToBarChart(prefix string, barMax int, segments []Segment, plan *MergePlan) | |||
return strings.Join(rv, "\n") | |||
} | |||
// ValidateMergePlannerOptions validates the merge planner options | |||
func ValidateMergePlannerOptions(options *MergePlanOptions) error { | |||
if options.MaxSegmentSize > MaxSegmentSizeLimit { | |||
return ErrMaxSegmentSizeTooLarge | |||
} | |||
return nil | |||
} |
@@ -0,0 +1,420 @@ | |||
// Copyright (c) 2018 Couchbase, Inc. | |||
// | |||
// Licensed under the Apache License, Version 2.0 (the "License"); | |||
// you may not use this file except in compliance with the License. | |||
// You may obtain a copy of the License at | |||
// | |||
// http://www.apache.org/licenses/LICENSE-2.0 | |||
// | |||
// Unless required by applicable law or agreed to in writing, software | |||
// distributed under the License is distributed on an "AS IS" BASIS, | |||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
// See the License for the specific language governing permissions and | |||
// limitations under the License. | |||
package scorch | |||
import ( | |||
"fmt" | |||
"github.com/RoaringBitmap/roaring" | |||
"github.com/blevesearch/bleve/index" | |||
"github.com/blevesearch/bleve/index/scorch/segment" | |||
"github.com/blevesearch/bleve/index/scorch/segment/zap" | |||
) | |||
var OptimizeConjunction = true | |||
var OptimizeConjunctionUnadorned = true | |||
var OptimizeDisjunctionUnadorned = true | |||
func (s *IndexSnapshotTermFieldReader) Optimize(kind string, | |||
octx index.OptimizableContext) (index.OptimizableContext, error) { | |||
if OptimizeConjunction && kind == "conjunction" { | |||
return s.optimizeConjunction(octx) | |||
} | |||
if OptimizeConjunctionUnadorned && kind == "conjunction:unadorned" { | |||
return s.optimizeConjunctionUnadorned(octx) | |||
} | |||
if OptimizeDisjunctionUnadorned && kind == "disjunction:unadorned" { | |||
return s.optimizeDisjunctionUnadorned(octx) | |||
} | |||
return octx, nil | |||
} | |||
var OptimizeDisjunctionUnadornedMinChildCardinality = uint64(256) | |||
// ---------------------------------------------------------------- | |||
func (s *IndexSnapshotTermFieldReader) optimizeConjunction( | |||
octx index.OptimizableContext) (index.OptimizableContext, error) { | |||
if octx == nil { | |||
octx = &OptimizeTFRConjunction{snapshot: s.snapshot} | |||
} | |||
o, ok := octx.(*OptimizeTFRConjunction) | |||
if !ok { | |||
return octx, nil | |||
} | |||
if o.snapshot != s.snapshot { | |||
return nil, fmt.Errorf("tried to optimize conjunction across different snapshots") | |||
} | |||
o.tfrs = append(o.tfrs, s) | |||
return o, nil | |||
} | |||
type OptimizeTFRConjunction struct { | |||
snapshot *IndexSnapshot | |||
tfrs []*IndexSnapshotTermFieldReader | |||
} | |||
func (o *OptimizeTFRConjunction) Finish() (index.Optimized, error) { | |||
if len(o.tfrs) <= 1 { | |||
return nil, nil | |||
} | |||
for i := range o.snapshot.segment { | |||
itr0, ok := o.tfrs[0].iterators[i].(*zap.PostingsIterator) | |||
if !ok || itr0.ActualBM == nil { | |||
continue | |||
} | |||
itr1, ok := o.tfrs[1].iterators[i].(*zap.PostingsIterator) | |||
if !ok || itr1.ActualBM == nil { | |||
continue | |||
} | |||
bm := roaring.And(itr0.ActualBM, itr1.ActualBM) | |||
for _, tfr := range o.tfrs[2:] { | |||
itr, ok := tfr.iterators[i].(*zap.PostingsIterator) | |||
if !ok || itr.ActualBM == nil { | |||
continue | |||
} | |||
bm.And(itr.ActualBM) | |||
} | |||
// in this conjunction optimization, the postings iterators | |||
// will all share the same AND'ed together actual bitmap. The | |||
// regular conjunction searcher machinery will still be used, | |||
// but the underlying bitmap will be smaller. | |||
for _, tfr := range o.tfrs { | |||
itr, ok := tfr.iterators[i].(*zap.PostingsIterator) | |||
if ok && itr.ActualBM != nil { | |||
itr.ActualBM = bm | |||
itr.Actual = bm.Iterator() | |||
} | |||
} | |||
} | |||
return nil, nil | |||
} | |||
// ---------------------------------------------------------------- | |||
// An "unadorned" conjunction optimization is appropriate when | |||
// additional or subsidiary information like freq-norm's and | |||
// term-vectors are not required, and instead only the internal-id's | |||
// are needed. | |||
func (s *IndexSnapshotTermFieldReader) optimizeConjunctionUnadorned( | |||
octx index.OptimizableContext) (index.OptimizableContext, error) { | |||
if octx == nil { | |||
octx = &OptimizeTFRConjunctionUnadorned{snapshot: s.snapshot} | |||
} | |||
o, ok := octx.(*OptimizeTFRConjunctionUnadorned) | |||
if !ok { | |||
return nil, nil | |||
} | |||
if o.snapshot != s.snapshot { | |||
return nil, fmt.Errorf("tried to optimize unadorned conjunction across different snapshots") | |||
} | |||
o.tfrs = append(o.tfrs, s) | |||
return o, nil | |||
} | |||
type OptimizeTFRConjunctionUnadorned struct { | |||
snapshot *IndexSnapshot | |||
tfrs []*IndexSnapshotTermFieldReader | |||
} | |||
var OptimizeTFRConjunctionUnadornedTerm = []byte("<conjunction:unadorned>") | |||
var OptimizeTFRConjunctionUnadornedField = "*" | |||
// Finish of an unadorned conjunction optimization will compute a | |||
// termFieldReader with an "actual" bitmap that represents the | |||
// constituent bitmaps AND'ed together. This termFieldReader cannot | |||
// provide any freq-norm or termVector associated information. | |||
func (o *OptimizeTFRConjunctionUnadorned) Finish() (rv index.Optimized, err error) { | |||
if len(o.tfrs) <= 1 { | |||
return nil, nil | |||
} | |||
// We use an artificial term and field because the optimized | |||
// termFieldReader can represent multiple terms and fields. | |||
oTFR := &IndexSnapshotTermFieldReader{ | |||
term: OptimizeTFRConjunctionUnadornedTerm, | |||
field: OptimizeTFRConjunctionUnadornedField, | |||
snapshot: o.snapshot, | |||
iterators: make([]segment.PostingsIterator, len(o.snapshot.segment)), | |||
segmentOffset: 0, | |||
includeFreq: false, | |||
includeNorm: false, | |||
includeTermVectors: false, | |||
} | |||
var actualBMs []*roaring.Bitmap // Collected from regular posting lists. | |||
OUTER: | |||
for i := range o.snapshot.segment { | |||
actualBMs = actualBMs[:0] | |||
var docNum1HitLast uint64 | |||
var docNum1HitLastOk bool | |||
for _, tfr := range o.tfrs { | |||
if _, ok := tfr.iterators[i].(*segment.EmptyPostingsIterator); ok { | |||
// An empty postings iterator means the entire AND is empty. | |||
oTFR.iterators[i] = segment.AnEmptyPostingsIterator | |||
continue OUTER | |||
} | |||
itr, ok := tfr.iterators[i].(*zap.PostingsIterator) | |||
if !ok { | |||
// We optimize zap postings iterators only. | |||
return nil, nil | |||
} | |||
// If the postings iterator is "1-hit" optimized, then we | |||
// can perform several optimizations up-front here. | |||
docNum1Hit, ok := itr.DocNum1Hit() | |||
if ok { | |||
if docNum1Hit == zap.DocNum1HitFinished { | |||
// An empty docNum here means the entire AND is empty. | |||
oTFR.iterators[i] = segment.AnEmptyPostingsIterator | |||
continue OUTER | |||
} | |||
if docNum1HitLastOk && docNum1HitLast != docNum1Hit { | |||
// The docNum1Hit doesn't match the previous | |||
// docNum1HitLast, so the entire AND is empty. | |||
oTFR.iterators[i] = segment.AnEmptyPostingsIterator | |||
continue OUTER | |||
} | |||
docNum1HitLast = docNum1Hit | |||
docNum1HitLastOk = true | |||
continue | |||
} | |||
if itr.ActualBM == nil { | |||
// An empty actual bitmap means the entire AND is empty. | |||
oTFR.iterators[i] = segment.AnEmptyPostingsIterator | |||
continue OUTER | |||
} | |||
// Collect the actual bitmap for more processing later. | |||
actualBMs = append(actualBMs, itr.ActualBM) | |||
} | |||
if docNum1HitLastOk { | |||
// We reach here if all the 1-hit optimized posting | |||
// iterators had the same 1-hit docNum, so we can check if | |||
// our collected actual bitmaps also have that docNum. | |||
for _, bm := range actualBMs { | |||
if !bm.Contains(uint32(docNum1HitLast)) { | |||
// The docNum1Hit isn't in one of our actual | |||
// bitmaps, so the entire AND is empty. | |||
oTFR.iterators[i] = segment.AnEmptyPostingsIterator | |||
continue OUTER | |||
} | |||
} | |||
// The actual bitmaps and docNum1Hits all contain or have | |||
// the same 1-hit docNum, so that's our AND'ed result. | |||
oTFR.iterators[i], err = zap.PostingsIteratorFrom1Hit( | |||
docNum1HitLast, zap.NormBits1Hit, false, false) | |||
if err != nil { | |||
return nil, nil | |||
} | |||
continue OUTER | |||
} | |||
if len(actualBMs) == 0 { | |||
// If we've collected no actual bitmaps at this point, | |||
// then the entire AND is empty. | |||
oTFR.iterators[i] = segment.AnEmptyPostingsIterator | |||
continue OUTER | |||
} | |||
if len(actualBMs) == 1 { | |||
// If we've only 1 actual bitmap, then that's our result. | |||
oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap( | |||
actualBMs[0], false, false) | |||
if err != nil { | |||
return nil, nil | |||
} | |||
continue OUTER | |||
} | |||
// Else, AND together our collected bitmaps as our result. | |||
bm := roaring.And(actualBMs[0], actualBMs[1]) | |||
for _, actualBM := range actualBMs[2:] { | |||
bm.And(actualBM) | |||
} | |||
oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap( | |||
bm, false, false) | |||
if err != nil { | |||
return nil, nil | |||
} | |||
} | |||
return oTFR, nil | |||
} | |||
// ---------------------------------------------------------------- | |||
// An "unadorned" disjunction optimization is appropriate when | |||
// additional or subsidiary information like freq-norm's and | |||
// term-vectors are not required, and instead only the internal-id's | |||
// are needed. | |||
func (s *IndexSnapshotTermFieldReader) optimizeDisjunctionUnadorned( | |||
octx index.OptimizableContext) (index.OptimizableContext, error) { | |||
if octx == nil { | |||
octx = &OptimizeTFRDisjunctionUnadorned{snapshot: s.snapshot} | |||
} | |||
o, ok := octx.(*OptimizeTFRDisjunctionUnadorned) | |||
if !ok { | |||
return nil, nil | |||
} | |||
if o.snapshot != s.snapshot { | |||
return nil, fmt.Errorf("tried to optimize unadorned disjunction across different snapshots") | |||
} | |||
o.tfrs = append(o.tfrs, s) | |||
return o, nil | |||
} | |||
type OptimizeTFRDisjunctionUnadorned struct { | |||
snapshot *IndexSnapshot | |||
tfrs []*IndexSnapshotTermFieldReader | |||
} | |||
var OptimizeTFRDisjunctionUnadornedTerm = []byte("<disjunction:unadorned>") | |||
var OptimizeTFRDisjunctionUnadornedField = "*" | |||
// Finish of an unadorned disjunction optimization will compute a | |||
// termFieldReader with an "actual" bitmap that represents the | |||
// constituent bitmaps OR'ed together. This termFieldReader cannot | |||
// provide any freq-norm or termVector associated information. | |||
func (o *OptimizeTFRDisjunctionUnadorned) Finish() (rv index.Optimized, err error) { | |||
if len(o.tfrs) <= 1 { | |||
return nil, nil | |||
} | |||
for i := range o.snapshot.segment { | |||
var cMax uint64 | |||
for _, tfr := range o.tfrs { | |||
itr, ok := tfr.iterators[i].(*zap.PostingsIterator) | |||
if !ok { | |||
return nil, nil | |||
} | |||
if itr.ActualBM != nil { | |||
c := itr.ActualBM.GetCardinality() | |||
if cMax < c { | |||
cMax = c | |||
} | |||
} | |||
} | |||
// Heuristic to skip the optimization if all the constituent | |||
// bitmaps are too small, where the processing & resource | |||
// overhead to create the OR'ed bitmap outweighs the benefit. | |||
if cMax < OptimizeDisjunctionUnadornedMinChildCardinality { | |||
return nil, nil | |||
} | |||
} | |||
// We use an artificial term and field because the optimized | |||
// termFieldReader can represent multiple terms and fields. | |||
oTFR := &IndexSnapshotTermFieldReader{ | |||
term: OptimizeTFRDisjunctionUnadornedTerm, | |||
field: OptimizeTFRDisjunctionUnadornedField, | |||
snapshot: o.snapshot, | |||
iterators: make([]segment.PostingsIterator, len(o.snapshot.segment)), | |||
segmentOffset: 0, | |||
includeFreq: false, | |||
includeNorm: false, | |||
includeTermVectors: false, | |||
} | |||
var docNums []uint32 // Collected docNum's from 1-hit posting lists. | |||
var actualBMs []*roaring.Bitmap // Collected from regular posting lists. | |||
for i := range o.snapshot.segment { | |||
docNums = docNums[:0] | |||
actualBMs = actualBMs[:0] | |||
for _, tfr := range o.tfrs { | |||
itr, ok := tfr.iterators[i].(*zap.PostingsIterator) | |||
if !ok { | |||
return nil, nil | |||
} | |||
docNum, ok := itr.DocNum1Hit() | |||
if ok { | |||
docNums = append(docNums, uint32(docNum)) | |||
continue | |||
} | |||
if itr.ActualBM != nil { | |||
actualBMs = append(actualBMs, itr.ActualBM) | |||
} | |||
} | |||
var bm *roaring.Bitmap | |||
if len(actualBMs) > 2 { | |||
bm = roaring.HeapOr(actualBMs...) | |||
} else if len(actualBMs) == 2 { | |||
bm = roaring.Or(actualBMs[0], actualBMs[1]) | |||
} else if len(actualBMs) == 1 { | |||
bm = actualBMs[0].Clone() | |||
} | |||
if bm == nil { | |||
bm = roaring.New() | |||
} | |||
bm.AddMany(docNums) | |||
oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap(bm, false, false) | |||
if err != nil { | |||
return nil, nil | |||
} | |||
} | |||
return oTFR, nil | |||
} |
@@ -16,9 +16,12 @@ package scorch | |||
import ( | |||
"bytes" | |||
"encoding/binary" | |||
"encoding/json" | |||
"fmt" | |||
"io/ioutil" | |||
"log" | |||
"math" | |||
"os" | |||
"path/filepath" | |||
"strconv" | |||
@@ -27,23 +30,57 @@ import ( | |||
"time" | |||
"github.com/RoaringBitmap/roaring" | |||
"github.com/blevesearch/bleve/index" | |||
"github.com/blevesearch/bleve/index/scorch/segment" | |||
"github.com/blevesearch/bleve/index/scorch/segment/zap" | |||
"github.com/boltdb/bolt" | |||
bolt "github.com/etcd-io/bbolt" | |||
) | |||
var DefaultChunkFactor uint32 = 1024 | |||
// Arbitrary number, need to make it configurable. | |||
// Lower values like 10/making persister really slow | |||
// doesn't work well as it is creating more files to | |||
// persist for in next persist iteration and spikes the # FDs. | |||
// Ideal value should let persister also proceed at | |||
// an optimum pace so that the merger can skip | |||
// many intermediate snapshots. | |||
// This needs to be based on empirical data. | |||
// TODO - may need to revisit this approach/value. | |||
var epochDistance = uint64(5) | |||
// DefaultPersisterNapTimeMSec is kept to zero as this helps in direct | |||
// persistence of segments with the default safe batch option. | |||
// If the default safe batch option results in high number of | |||
// files on disk, then users may initialise this configuration parameter | |||
// with higher values so that the persister will nap a bit within it's | |||
// work loop to favour better in-memory merging of segments to result | |||
// in fewer segment files on disk. But that may come with an indexing | |||
// performance overhead. | |||
// Unsafe batch users are advised to override this to higher value | |||
// for better performance especially with high data density. | |||
var DefaultPersisterNapTimeMSec int = 0 // ms | |||
// DefaultPersisterNapUnderNumFiles helps in controlling the pace of | |||
// persister. At times of a slow merger progress with heavy file merging | |||
// operations, its better to pace down the persister for letting the merger | |||
// to catch up within a range defined by this parameter. | |||
// Fewer files on disk (as per the merge plan) would result in keeping the | |||
// file handle usage under limit, faster disk merger and a healthier index. | |||
// Its been observed that such a loosely sync'ed introducer-persister-merger | |||
// trio results in better overall performance. | |||
var DefaultPersisterNapUnderNumFiles int = 1000 | |||
var DefaultMemoryPressurePauseThreshold uint64 = math.MaxUint64 | |||
type persisterOptions struct { | |||
// PersisterNapTimeMSec controls the wait/delay injected into | |||
// persistence workloop to improve the chances for | |||
// a healthier and heavier in-memory merging | |||
PersisterNapTimeMSec int | |||
// PersisterNapTimeMSec > 0, and the number of files is less than | |||
// PersisterNapUnderNumFiles, then the persister will sleep | |||
// PersisterNapTimeMSec amount of time to improve the chances for | |||
// a healthier and heavier in-memory merging | |||
PersisterNapUnderNumFiles int | |||
// MemoryPressurePauseThreshold let persister to have a better leeway | |||
// for prudently performing the memory merge of segments on a memory | |||
// pressure situation. Here the config value is an upper threshold | |||
// for the number of paused application threads. The default value would | |||
// be a very high number to always favour the merging of memory segments. | |||
MemoryPressurePauseThreshold uint64 | |||
} | |||
type notificationChan chan struct{} | |||
@@ -53,8 +90,17 @@ func (s *Scorch) persisterLoop() { | |||
var persistWatchers []*epochWatcher | |||
var lastPersistedEpoch, lastMergedEpoch uint64 | |||
var ew *epochWatcher | |||
po, err := s.parsePersisterOptions() | |||
if err != nil { | |||
s.fireAsyncError(fmt.Errorf("persisterOptions json parsing err: %v", err)) | |||
s.asyncTasks.Done() | |||
return | |||
} | |||
OUTER: | |||
for { | |||
atomic.AddUint64(&s.stats.TotPersistLoopBeg, 1) | |||
select { | |||
case <-s.closeCh: | |||
break OUTER | |||
@@ -65,11 +111,13 @@ OUTER: | |||
if ew != nil && ew.epoch > lastMergedEpoch { | |||
lastMergedEpoch = ew.epoch | |||
} | |||
persistWatchers = s.pausePersisterForMergerCatchUp(lastPersistedEpoch, | |||
&lastMergedEpoch, persistWatchers) | |||
lastMergedEpoch, persistWatchers = s.pausePersisterForMergerCatchUp(lastPersistedEpoch, | |||
lastMergedEpoch, persistWatchers, po) | |||
var ourSnapshot *IndexSnapshot | |||
var ourPersisted []chan error | |||
var ourPersistedCallbacks []index.BatchCallback | |||
// check to see if there is a new snapshot to persist | |||
s.rootLock.Lock() | |||
@@ -78,13 +126,17 @@ OUTER: | |||
ourSnapshot.AddRef() | |||
ourPersisted = s.rootPersisted | |||
s.rootPersisted = nil | |||
ourPersistedCallbacks = s.persistedCallbacks | |||
s.persistedCallbacks = nil | |||
atomic.StoreUint64(&s.iStats.persistSnapshotSize, uint64(ourSnapshot.Size())) | |||
atomic.StoreUint64(&s.iStats.persistEpoch, ourSnapshot.epoch) | |||
} | |||
s.rootLock.Unlock() | |||
if ourSnapshot != nil { | |||
startTime := time.Now() | |||
err := s.persistSnapshot(ourSnapshot) | |||
err := s.persistSnapshot(ourSnapshot, po) | |||
for _, ch := range ourPersisted { | |||
if err != nil { | |||
ch <- err | |||
@@ -92,10 +144,22 @@ OUTER: | |||
close(ch) | |||
} | |||
if err != nil { | |||
atomic.StoreUint64(&s.iStats.persistEpoch, 0) | |||
if err == segment.ErrClosed { | |||
// index has been closed | |||
_ = ourSnapshot.DecRef() | |||
break OUTER | |||
} | |||
s.fireAsyncError(fmt.Errorf("got err persisting snapshot: %v", err)) | |||
_ = ourSnapshot.DecRef() | |||
atomic.AddUint64(&s.stats.TotPersistLoopErr, 1) | |||
continue OUTER | |||
} | |||
for i := range ourPersistedCallbacks { | |||
ourPersistedCallbacks[i](err) | |||
} | |||
atomic.StoreUint64(&s.stats.LastPersistedEpoch, ourSnapshot.epoch) | |||
lastPersistedEpoch = ourSnapshot.epoch | |||
for _, ew := range persistWatchers { | |||
@@ -115,6 +179,8 @@ OUTER: | |||
s.fireEvent(EventKindPersisterProgress, time.Since(startTime)) | |||
if changed { | |||
s.removeOldData() | |||
atomic.AddUint64(&s.stats.TotPersistLoopProgress, 1) | |||
continue OUTER | |||
} | |||
} | |||
@@ -133,17 +199,21 @@ OUTER: | |||
s.removeOldData() // might as well cleanup while waiting | |||
atomic.AddUint64(&s.stats.TotPersistLoopWait, 1) | |||
select { | |||
case <-s.closeCh: | |||
break OUTER | |||
case <-w.notifyCh: | |||
// woken up, next loop should pick up work | |||
continue OUTER | |||
atomic.AddUint64(&s.stats.TotPersistLoopWaitNotified, 1) | |||
case ew = <-s.persisterNotifier: | |||
// if the watchers are already caught up then let them wait, | |||
// else let them continue to do the catch up | |||
persistWatchers = append(persistWatchers, ew) | |||
} | |||
atomic.AddUint64(&s.stats.TotPersistLoopEnd, 1) | |||
} | |||
} | |||
@@ -160,38 +230,95 @@ func notifyMergeWatchers(lastPersistedEpoch uint64, | |||
return watchersNext | |||
} | |||
func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, lastMergedEpoch *uint64, | |||
persistWatchers []*epochWatcher) []*epochWatcher { | |||
func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, lastMergedEpoch uint64, | |||
persistWatchers []*epochWatcher, po *persisterOptions) (uint64, []*epochWatcher) { | |||
// first, let the watchers proceed if they lag behind | |||
persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers) | |||
// check the merger lag by counting the segment files on disk, | |||
// On finding fewer files on disk, persister takes a short pause | |||
// for sufficient in-memory segments to pile up for the next | |||
// memory merge cum persist loop. | |||
// On finding too many files on disk, persister pause until the merger | |||
// catches up to reduce the segment file count under the threshold. | |||
// But if there is memory pressure, then skip this sleep maneuvers. | |||
numFilesOnDisk, _ := s.diskFileStats() | |||
if numFilesOnDisk < uint64(po.PersisterNapUnderNumFiles) && | |||
po.PersisterNapTimeMSec > 0 && s.paused() == 0 { | |||
select { | |||
case <-s.closeCh: | |||
case <-time.After(time.Millisecond * time.Duration(po.PersisterNapTimeMSec)): | |||
atomic.AddUint64(&s.stats.TotPersisterNapPauseCompleted, 1) | |||
case ew := <-s.persisterNotifier: | |||
// unblock the merger in meantime | |||
persistWatchers = append(persistWatchers, ew) | |||
lastMergedEpoch = ew.epoch | |||
persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers) | |||
atomic.AddUint64(&s.stats.TotPersisterMergerNapBreak, 1) | |||
} | |||
return lastMergedEpoch, persistWatchers | |||
} | |||
OUTER: | |||
// check for slow merger and await until the merger catch up | |||
for lastPersistedEpoch > *lastMergedEpoch+epochDistance { | |||
for po.PersisterNapUnderNumFiles > 0 && | |||
numFilesOnDisk >= uint64(po.PersisterNapUnderNumFiles) && | |||
lastMergedEpoch < lastPersistedEpoch { | |||
atomic.AddUint64(&s.stats.TotPersisterSlowMergerPause, 1) | |||
select { | |||
case <-s.closeCh: | |||
break OUTER | |||
case ew := <-s.persisterNotifier: | |||
persistWatchers = append(persistWatchers, ew) | |||
*lastMergedEpoch = ew.epoch | |||
lastMergedEpoch = ew.epoch | |||
} | |||
atomic.AddUint64(&s.stats.TotPersisterSlowMergerResume, 1) | |||
// let the watchers proceed if they lag behind | |||
persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers) | |||
numFilesOnDisk, _ = s.diskFileStats() | |||
} | |||
return persistWatchers | |||
return lastMergedEpoch, persistWatchers | |||
} | |||
func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { | |||
persisted, err := s.persistSnapshotMaybeMerge(snapshot) | |||
if err != nil { | |||
return err | |||
func (s *Scorch) parsePersisterOptions() (*persisterOptions, error) { | |||
po := persisterOptions{ | |||
PersisterNapTimeMSec: DefaultPersisterNapTimeMSec, | |||
PersisterNapUnderNumFiles: DefaultPersisterNapUnderNumFiles, | |||
MemoryPressurePauseThreshold: DefaultMemoryPressurePauseThreshold, | |||
} | |||
if persisted { | |||
return nil | |||
if v, ok := s.config["scorchPersisterOptions"]; ok { | |||
b, err := json.Marshal(v) | |||
if err != nil { | |||
return &po, err | |||
} | |||
err = json.Unmarshal(b, &po) | |||
if err != nil { | |||
return &po, err | |||
} | |||
} | |||
return &po, nil | |||
} | |||
func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot, | |||
po *persisterOptions) error { | |||
// Perform in-memory segment merging only when the memory pressure is | |||
// below the configured threshold, else the persister performs the | |||
// direct persistence of segments. | |||
if s.paused() < po.MemoryPressurePauseThreshold { | |||
persisted, err := s.persistSnapshotMaybeMerge(snapshot) | |||
if err != nil { | |||
return err | |||
} | |||
if persisted { | |||
return nil | |||
} | |||
} | |||
return s.persistSnapshotDirect(snapshot) | |||
@@ -224,7 +351,7 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot) ( | |||
return false, nil | |||
} | |||
_, newSnapshot, newSegmentID, err := s.mergeSegmentBases( | |||
newSnapshot, newSegmentID, err := s.mergeSegmentBases( | |||
snapshot, sbs, sbsDrops, sbsIndexes, DefaultChunkFactor) | |||
if err != nil { | |||
return false, err | |||
@@ -249,6 +376,7 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot) ( | |||
segment: make([]*SegmentSnapshot, 0, len(snapshot.segment)), | |||
internal: snapshot.internal, | |||
epoch: snapshot.epoch, | |||
creator: "persistSnapshotMaybeMerge", | |||
} | |||
// copy to the equiv the segments that weren't replaced | |||
@@ -301,6 +429,22 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) { | |||
return err | |||
} | |||
// persist meta values | |||
metaBucket, err := snapshotBucket.CreateBucketIfNotExists(boltMetaDataKey) | |||
if err != nil { | |||
return err | |||
} | |||
err = metaBucket.Put([]byte("type"), []byte(zap.Type)) | |||
if err != nil { | |||
return err | |||
} | |||
buf := make([]byte, binary.MaxVarintLen32) | |||
binary.BigEndian.PutUint32(buf, zap.Version) | |||
err = metaBucket.Put([]byte("version"), buf) | |||
if err != nil { | |||
return err | |||
} | |||
// persist internal values | |||
internalBucket, err := snapshotBucket.CreateBucketIfNotExists(boltInternalKey) | |||
if err != nil { | |||
@@ -390,44 +534,21 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) { | |||
} | |||
} | |||
s.rootLock.Lock() | |||
newIndexSnapshot := &IndexSnapshot{ | |||
parent: s, | |||
epoch: s.nextSnapshotEpoch, | |||
segment: make([]*SegmentSnapshot, len(s.root.segment)), | |||
offsets: make([]uint64, len(s.root.offsets)), | |||
internal: make(map[string][]byte, len(s.root.internal)), | |||
refs: 1, | |||
} | |||
s.nextSnapshotEpoch++ | |||
for i, segmentSnapshot := range s.root.segment { | |||
// see if this segment has been replaced | |||
if replacement, ok := newSegments[segmentSnapshot.id]; ok { | |||
newSegmentSnapshot := &SegmentSnapshot{ | |||
id: segmentSnapshot.id, | |||
segment: replacement, | |||
deleted: segmentSnapshot.deleted, | |||
cachedDocs: segmentSnapshot.cachedDocs, | |||
} | |||
newIndexSnapshot.segment[i] = newSegmentSnapshot | |||
delete(newSegments, segmentSnapshot.id) | |||
// update items persisted incase of a new segment snapshot | |||
atomic.AddUint64(&s.stats.numItemsPersisted, newSegmentSnapshot.Count()) | |||
} else { | |||
newIndexSnapshot.segment[i] = s.root.segment[i] | |||
newIndexSnapshot.segment[i].segment.AddRef() | |||
} | |||
newIndexSnapshot.offsets[i] = s.root.offsets[i] | |||
persist := &persistIntroduction{ | |||
persisted: newSegments, | |||
applied: make(notificationChan), | |||
} | |||
for k, v := range s.root.internal { | |||
newIndexSnapshot.internal[k] = v | |||
select { | |||
case <-s.closeCh: | |||
return segment.ErrClosed | |||
case s.persists <- persist: | |||
} | |||
rootPrev := s.root | |||
s.root = newIndexSnapshot | |||
s.rootLock.Unlock() | |||
if rootPrev != nil { | |||
_ = rootPrev.DecRef() | |||
select { | |||
case <-s.closeCh: | |||
return segment.ErrClosed | |||
case <-persist.applied: | |||
} | |||
} | |||
@@ -462,6 +583,7 @@ var boltSnapshotsBucket = []byte{'s'} | |||
var boltPathKey = []byte{'p'} | |||
var boltDeletedKey = []byte{'d'} | |||
var boltInternalKey = []byte{'i'} | |||
var boltMetaDataKey = []byte{'m'} | |||
func (s *Scorch) loadFromBolt() error { | |||
return s.rootBolt.View(func(tx *bolt.Tx) error { | |||
@@ -478,19 +600,19 @@ func (s *Scorch) loadFromBolt() error { | |||
continue | |||
} | |||
if foundRoot { | |||
s.eligibleForRemoval = append(s.eligibleForRemoval, snapshotEpoch) | |||
s.AddEligibleForRemoval(snapshotEpoch) | |||
continue | |||
} | |||
snapshot := snapshots.Bucket(k) | |||
if snapshot == nil { | |||
log.Printf("snapshot key, but bucket missing %x, continuing", k) | |||
s.eligibleForRemoval = append(s.eligibleForRemoval, snapshotEpoch) | |||
s.AddEligibleForRemoval(snapshotEpoch) | |||
continue | |||
} | |||
indexSnapshot, err := s.loadSnapshot(snapshot) | |||
if err != nil { | |||
log.Printf("unable to load snapshot, %v, continuing", err) | |||
s.eligibleForRemoval = append(s.eligibleForRemoval, snapshotEpoch) | |||
s.AddEligibleForRemoval(snapshotEpoch) | |||
continue | |||
} | |||
indexSnapshot.epoch = snapshotEpoch | |||
@@ -500,13 +622,16 @@ func (s *Scorch) loadFromBolt() error { | |||
return err | |||
} | |||
s.nextSegmentID++ | |||
s.nextSnapshotEpoch = snapshotEpoch + 1 | |||
s.rootLock.Lock() | |||
if s.root != nil { | |||
_ = s.root.DecRef() | |||
} | |||
s.nextSnapshotEpoch = snapshotEpoch + 1 | |||
rootPrev := s.root | |||
s.root = indexSnapshot | |||
s.rootLock.Unlock() | |||
if rootPrev != nil { | |||
_ = rootPrev.DecRef() | |||
} | |||
foundRoot = true | |||
} | |||
return nil | |||
@@ -524,7 +649,7 @@ func (s *Scorch) LoadSnapshot(epoch uint64) (rv *IndexSnapshot, err error) { | |||
snapshotKey := segment.EncodeUvarintAscending(nil, epoch) | |||
snapshot := snapshots.Bucket(snapshotKey) | |||
if snapshot == nil { | |||
return nil | |||
return fmt.Errorf("snapshot with epoch: %v - doesn't exist", epoch) | |||
} | |||
rv, err = s.loadSnapshot(snapshot) | |||
return err | |||
@@ -536,12 +661,13 @@ func (s *Scorch) LoadSnapshot(epoch uint64) (rv *IndexSnapshot, err error) { | |||
} | |||
func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) { | |||
rv := &IndexSnapshot{ | |||
parent: s, | |||
internal: make(map[string][]byte), | |||
refs: 1, | |||
creator: "loadSnapshot", | |||
} | |||
var running uint64 | |||
c := snapshot.Cursor() | |||
for k, _ := c.First(); k != nil; k, _ = c.Next() { | |||
@@ -556,7 +682,7 @@ func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) { | |||
_ = rv.DecRef() | |||
return nil, err | |||
} | |||
} else { | |||
} else if k[0] != boltMetaDataKey[0] { | |||
segmentBucket := snapshot.Bucket(k) | |||
if segmentBucket == nil { | |||
_ = rv.DecRef() | |||
@@ -577,6 +703,7 @@ func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) { | |||
running += segmentSnapshot.segment.Count() | |||
} | |||
} | |||
return rv, nil | |||
} | |||
@@ -604,7 +731,9 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro | |||
_ = segment.Close() | |||
return nil, fmt.Errorf("error reading deleted bytes: %v", err) | |||
} | |||
rv.deleted = deletedBitmap | |||
if !deletedBitmap.IsEmpty() { | |||
rv.deleted = deletedBitmap | |||
} | |||
} | |||
return rv, nil | |||
@@ -643,14 +772,14 @@ func (s *Scorch) removeOldBoltSnapshots() (numRemoved int, err error) { | |||
return 0, err | |||
} | |||
if len(persistedEpochs) <= NumSnapshotsToKeep { | |||
if len(persistedEpochs) <= s.numSnapshotsToKeep { | |||
// we need to keep everything | |||
return 0, nil | |||
} | |||
// make a map of epochs to protect from deletion | |||
protectedEpochs := make(map[uint64]struct{}, NumSnapshotsToKeep) | |||
for _, epoch := range persistedEpochs[0:NumSnapshotsToKeep] { | |||
protectedEpochs := make(map[uint64]struct{}, s.numSnapshotsToKeep) | |||
for _, epoch := range persistedEpochs[0:s.numSnapshotsToKeep] { | |||
protectedEpochs[epoch] = struct{}{} | |||
} | |||
@@ -668,7 +797,7 @@ func (s *Scorch) removeOldBoltSnapshots() (numRemoved int, err error) { | |||
s.eligibleForRemoval = newEligible | |||
s.rootLock.Unlock() | |||
if len(epochsToRemove) <= 0 { | |||
if len(epochsToRemove) == 0 { | |||
return 0, nil | |||
} | |||
@@ -1,110 +0,0 @@ | |||
// Copyright (c) 2017 Couchbase, Inc. | |||
// | |||
// Licensed under the Apache License, Version 2.0 (the "License"); | |||
// you may not use this file except in compliance with the License. | |||
// You may obtain a copy of the License at | |||
// | |||
// http://www.apache.org/licenses/LICENSE-2.0 | |||
// | |||
// Unless required by applicable law or agreed to in writing, software | |||
// distributed under the License is distributed on an "AS IS" BASIS, | |||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
// See the License for the specific language governing permissions and | |||
// limitations under the License. | |||
package scorch | |||
import ( | |||
"github.com/blevesearch/bleve/document" | |||
"github.com/blevesearch/bleve/index" | |||
) | |||
type Reader struct { | |||
root *IndexSnapshot // Owns 1 ref-count on the index snapshot. | |||
} | |||
func (r *Reader) TermFieldReader(term []byte, field string, includeFreq, | |||
includeNorm, includeTermVectors bool) (index.TermFieldReader, error) { | |||
return r.root.TermFieldReader(term, field, includeFreq, includeNorm, includeTermVectors) | |||
} | |||
// DocIDReader returns an iterator over all doc ids | |||
// The caller must close returned instance to release associated resources. | |||
func (r *Reader) DocIDReaderAll() (index.DocIDReader, error) { | |||
return r.root.DocIDReaderAll() | |||
} | |||
func (r *Reader) DocIDReaderOnly(ids []string) (index.DocIDReader, error) { | |||
return r.root.DocIDReaderOnly(ids) | |||
} | |||
func (r *Reader) FieldDict(field string) (index.FieldDict, error) { | |||
return r.root.FieldDict(field) | |||
} | |||
// FieldDictRange is currently defined to include the start and end terms | |||
func (r *Reader) FieldDictRange(field string, startTerm []byte, | |||
endTerm []byte) (index.FieldDict, error) { | |||
return r.root.FieldDictRange(field, startTerm, endTerm) | |||
} | |||
func (r *Reader) FieldDictPrefix(field string, | |||
termPrefix []byte) (index.FieldDict, error) { | |||
return r.root.FieldDictPrefix(field, termPrefix) | |||
} | |||
func (r *Reader) Document(id string) (*document.Document, error) { | |||
return r.root.Document(id) | |||
} | |||
func (r *Reader) DocumentVisitFieldTerms(id index.IndexInternalID, fields []string, | |||
visitor index.DocumentFieldTermVisitor) error { | |||
return r.root.DocumentVisitFieldTerms(id, fields, visitor) | |||
} | |||
func (r *Reader) Fields() ([]string, error) { | |||
return r.root.Fields() | |||
} | |||
func (r *Reader) GetInternal(key []byte) ([]byte, error) { | |||
return r.root.GetInternal(key) | |||
} | |||
func (r *Reader) DocCount() (uint64, error) { | |||
return r.root.DocCount() | |||
} | |||
func (r *Reader) ExternalID(id index.IndexInternalID) (string, error) { | |||
return r.root.ExternalID(id) | |||
} | |||
func (r *Reader) InternalID(id string) (index.IndexInternalID, error) { | |||
return r.root.InternalID(id) | |||
} | |||
func (r *Reader) DumpAll() chan interface{} { | |||
rv := make(chan interface{}) | |||
go func() { | |||
close(rv) | |||
}() | |||
return rv | |||
} | |||
func (r *Reader) DumpDoc(id string) chan interface{} { | |||
rv := make(chan interface{}) | |||
go func() { | |||
close(rv) | |||
}() | |||
return rv | |||
} | |||
func (r *Reader) DumpFields() chan interface{} { | |||
rv := make(chan interface{}) | |||
go func() { | |||
close(rv) | |||
}() | |||
return rv | |||
} | |||
func (r *Reader) Close() error { | |||
return r.root.DecRef() | |||
} |
@@ -17,6 +17,7 @@ package scorch | |||
import ( | |||
"encoding/json" | |||
"fmt" | |||
"io/ioutil" | |||
"os" | |||
"sync" | |||
"sync/atomic" | |||
@@ -27,23 +28,24 @@ import ( | |||
"github.com/blevesearch/bleve/document" | |||
"github.com/blevesearch/bleve/index" | |||
"github.com/blevesearch/bleve/index/scorch/segment" | |||
"github.com/blevesearch/bleve/index/scorch/segment/mem" | |||
"github.com/blevesearch/bleve/index/scorch/segment/zap" | |||
"github.com/blevesearch/bleve/index/store" | |||
"github.com/blevesearch/bleve/registry" | |||
"github.com/boltdb/bolt" | |||
bolt "github.com/etcd-io/bbolt" | |||
) | |||
const Name = "scorch" | |||
const Version uint8 = 1 | |||
const Version uint8 = 2 | |||
var ErrClosed = fmt.Errorf("scorch closed") | |||
type Scorch struct { | |||
readOnly bool | |||
version uint8 | |||
config map[string]interface{} | |||
analysisQueue *index.AnalysisQueue | |||
stats *Stats | |||
stats Stats | |||
nextSegmentID uint64 | |||
path string | |||
@@ -52,12 +54,15 @@ type Scorch struct { | |||
rootLock sync.RWMutex | |||
root *IndexSnapshot // holds 1 ref-count on the root | |||
rootPersisted []chan error // closed when root is persisted | |||
persistedCallbacks []index.BatchCallback | |||
nextSnapshotEpoch uint64 | |||
eligibleForRemoval []uint64 // Index snapshot epochs that are safe to GC. | |||
ineligibleForRemoval map[string]bool // Filenames that should not be GC'ed yet. | |||
numSnapshotsToKeep int | |||
closeCh chan struct{} | |||
introductions chan *segmentIntroduction | |||
persists chan *persistIntroduction | |||
merges chan *segmentMerge | |||
introducerNotifier chan *epochWatcher | |||
revertToSnapshots chan *snapshotReversion | |||
@@ -67,6 +72,23 @@ type Scorch struct { | |||
onEvent func(event Event) | |||
onAsyncError func(err error) | |||
iStats internalStats | |||
pauseLock sync.RWMutex | |||
pauseCount uint64 | |||
} | |||
type internalStats struct { | |||
persistEpoch uint64 | |||
persistSnapshotSize uint64 | |||
mergeEpoch uint64 | |||
mergeSnapshotSize uint64 | |||
newSegBufBytesAdded uint64 | |||
newSegBufBytesRemoved uint64 | |||
analysisBytesAdded uint64 | |||
analysisBytesRemoved uint64 | |||
} | |||
func NewScorch(storeName string, | |||
@@ -80,8 +102,7 @@ func NewScorch(storeName string, | |||
closeCh: make(chan struct{}), | |||
ineligibleForRemoval: map[string]bool{}, | |||
} | |||
rv.stats = &Stats{i: rv} | |||
rv.root = &IndexSnapshot{parent: rv, refs: 1} | |||
rv.root = &IndexSnapshot{parent: rv, refs: 1, creator: "NewScorch"} | |||
ro, ok := config["read_only"].(bool) | |||
if ok { | |||
rv.readOnly = ro | |||
@@ -101,9 +122,30 @@ func NewScorch(storeName string, | |||
return rv, nil | |||
} | |||
func (s *Scorch) paused() uint64 { | |||
s.pauseLock.Lock() | |||
pc := s.pauseCount | |||
s.pauseLock.Unlock() | |||
return pc | |||
} | |||
func (s *Scorch) incrPause() { | |||
s.pauseLock.Lock() | |||
s.pauseCount++ | |||
s.pauseLock.Unlock() | |||
} | |||
func (s *Scorch) decrPause() { | |||
s.pauseLock.Lock() | |||
s.pauseCount-- | |||
s.pauseLock.Unlock() | |||
} | |||
func (s *Scorch) fireEvent(kind EventKind, dur time.Duration) { | |||
if s.onEvent != nil { | |||
s.incrPause() | |||
s.onEvent(Event{Kind: kind, Scorch: s, Duration: dur}) | |||
s.decrPause() | |||
} | |||
} | |||
@@ -111,6 +153,7 @@ func (s *Scorch) fireAsyncError(err error) { | |||
if s.onAsyncError != nil { | |||
s.onAsyncError(err) | |||
} | |||
atomic.AddUint64(&s.stats.TotOnErrors, 1) | |||
} | |||
func (s *Scorch) Open() error { | |||
@@ -172,7 +215,10 @@ func (s *Scorch) openBolt() error { | |||
} | |||
} | |||
atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, uint64(len(s.root.segment))) | |||
s.introductions = make(chan *segmentIntroduction) | |||
s.persists = make(chan *persistIntroduction) | |||
s.merges = make(chan *segmentMerge) | |||
s.introducerNotifier = make(chan *epochWatcher, 1) | |||
s.revertToSnapshots = make(chan *snapshotReversion) | |||
@@ -186,6 +232,17 @@ func (s *Scorch) openBolt() error { | |||
} | |||
} | |||
s.numSnapshotsToKeep = NumSnapshotsToKeep | |||
if v, ok := s.config["numSnapshotsToKeep"]; ok { | |||
var t int | |||
if t, err = parseToInteger(v); err != nil { | |||
return fmt.Errorf("numSnapshotsToKeep parse err: %v", err) | |||
} | |||
if t > 0 { | |||
s.numSnapshotsToKeep = t | |||
} | |||
} | |||
return nil | |||
} | |||
@@ -255,65 +312,83 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { | |||
// FIXME could sort ids list concurrent with analysis? | |||
go func() { | |||
for _, doc := range batch.IndexOps { | |||
if doc != nil { | |||
aw := index.NewAnalysisWork(s, doc, resultChan) | |||
// put the work on the queue | |||
s.analysisQueue.Queue(aw) | |||
if len(batch.IndexOps) > 0 { | |||
go func() { | |||
for _, doc := range batch.IndexOps { | |||
if doc != nil { | |||
aw := index.NewAnalysisWork(s, doc, resultChan) | |||
// put the work on the queue | |||
s.analysisQueue.Queue(aw) | |||
} | |||
} | |||
} | |||
}() | |||
}() | |||
} | |||
// wait for analysis result | |||
analysisResults := make([]*index.AnalysisResult, int(numUpdates)) | |||
var itemsDeQueued uint64 | |||
var totalAnalysisSize int | |||
for itemsDeQueued < numUpdates { | |||
result := <-resultChan | |||
resultSize := result.Size() | |||
atomic.AddUint64(&s.iStats.analysisBytesAdded, uint64(resultSize)) | |||
totalAnalysisSize += resultSize | |||
analysisResults[itemsDeQueued] = result | |||
itemsDeQueued++ | |||
} | |||
close(resultChan) | |||
defer atomic.AddUint64(&s.iStats.analysisBytesRemoved, uint64(totalAnalysisSize)) | |||
atomic.AddUint64(&s.stats.TotAnalysisTime, uint64(time.Since(start))) | |||
atomic.AddUint64(&s.stats.analysisTime, uint64(time.Since(start))) | |||
indexStart := time.Now() | |||
// notify handlers that we're about to introduce a segment | |||
s.fireEvent(EventKindBatchIntroductionStart, 0) | |||
var newSegment segment.Segment | |||
var bufBytes uint64 | |||
if len(analysisResults) > 0 { | |||
newSegment, err = zap.NewSegmentBase(mem.NewFromAnalyzedDocs(analysisResults), DefaultChunkFactor) | |||
newSegment, bufBytes, err = zap.AnalysisResultsToSegmentBase(analysisResults, DefaultChunkFactor) | |||
if err != nil { | |||
return err | |||
} | |||
atomic.AddUint64(&s.iStats.newSegBufBytesAdded, bufBytes) | |||
} else { | |||
atomic.AddUint64(&s.stats.TotBatchesEmpty, 1) | |||
} | |||
err = s.prepareSegment(newSegment, ids, batch.InternalOps) | |||
err = s.prepareSegment(newSegment, ids, batch.InternalOps, batch.PersistedCallback()) | |||
if err != nil { | |||
if newSegment != nil { | |||
_ = newSegment.Close() | |||
} | |||
atomic.AddUint64(&s.stats.errors, 1) | |||
atomic.AddUint64(&s.stats.TotOnErrors, 1) | |||
} else { | |||
atomic.AddUint64(&s.stats.updates, numUpdates) | |||
atomic.AddUint64(&s.stats.deletes, numDeletes) | |||
atomic.AddUint64(&s.stats.batches, 1) | |||
atomic.AddUint64(&s.stats.numPlainTextBytesIndexed, numPlainTextBytes) | |||
atomic.AddUint64(&s.stats.TotUpdates, numUpdates) | |||
atomic.AddUint64(&s.stats.TotDeletes, numDeletes) | |||
atomic.AddUint64(&s.stats.TotBatches, 1) | |||
atomic.AddUint64(&s.stats.TotIndexedPlainTextBytes, numPlainTextBytes) | |||
} | |||
atomic.AddUint64(&s.iStats.newSegBufBytesRemoved, bufBytes) | |||
atomic.AddUint64(&s.stats.TotIndexTime, uint64(time.Since(indexStart))) | |||
return err | |||
} | |||
func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, | |||
internalOps map[string][]byte) error { | |||
internalOps map[string][]byte, persistedCallback index.BatchCallback) error { | |||
// new introduction | |||
introduction := &segmentIntroduction{ | |||
id: atomic.AddUint64(&s.nextSegmentID, 1), | |||
data: newSegment, | |||
ids: ids, | |||
obsoletes: make(map[uint64]*roaring.Bitmap), | |||
internal: internalOps, | |||
applied: make(chan error), | |||
id: atomic.AddUint64(&s.nextSegmentID, 1), | |||
data: newSegment, | |||
ids: ids, | |||
obsoletes: make(map[uint64]*roaring.Bitmap), | |||
internal: internalOps, | |||
applied: make(chan error), | |||
persistedCallback: persistedCallback, | |||
} | |||
if !s.unsafeBatch { | |||
@@ -326,6 +401,8 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, | |||
root.AddRef() | |||
s.rootLock.RUnlock() | |||
defer func() { _ = root.DecRef() }() | |||
for _, seg := range root.segment { | |||
delta, err := seg.segment.DocNumbers(ids) | |||
if err != nil { | |||
@@ -334,7 +411,7 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, | |||
introduction.obsoletes[seg.id] = delta | |||
} | |||
_ = root.DecRef() | |||
introStartTime := time.Now() | |||
s.introductions <- introduction | |||
@@ -348,6 +425,12 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, | |||
err = <-introduction.persisted | |||
} | |||
introTime := uint64(time.Since(introStartTime)) | |||
atomic.AddUint64(&s.stats.TotBatchIntroTime, introTime) | |||
if atomic.LoadUint64(&s.stats.MaxBatchIntroTime) < introTime { | |||
atomic.StoreUint64(&s.stats.MaxBatchIntroTime, introTime) | |||
} | |||
return err | |||
} | |||
@@ -366,18 +449,69 @@ func (s *Scorch) DeleteInternal(key []byte) error { | |||
// Reader returns a low-level accessor on the index data. Close it to | |||
// release associated resources. | |||
func (s *Scorch) Reader() (index.IndexReader, error) { | |||
return s.currentSnapshot(), nil | |||
} | |||
func (s *Scorch) currentSnapshot() *IndexSnapshot { | |||
s.rootLock.RLock() | |||
rv := &Reader{root: s.root} | |||
rv.root.AddRef() | |||
rv := s.root | |||
if rv != nil { | |||
rv.AddRef() | |||
} | |||
s.rootLock.RUnlock() | |||
return rv, nil | |||
return rv | |||
} | |||
func (s *Scorch) Stats() json.Marshaler { | |||
return s.stats | |||
return &s.stats | |||
} | |||
func (s *Scorch) diskFileStats() (uint64, uint64) { | |||
var numFilesOnDisk, numBytesUsedDisk uint64 | |||
if s.path != "" { | |||
finfos, err := ioutil.ReadDir(s.path) | |||
if err == nil { | |||
for _, finfo := range finfos { | |||
if !finfo.IsDir() { | |||
numBytesUsedDisk += uint64(finfo.Size()) | |||
numFilesOnDisk++ | |||
} | |||
} | |||
} | |||
} | |||
return numFilesOnDisk, numBytesUsedDisk | |||
} | |||
func (s *Scorch) StatsMap() map[string]interface{} { | |||
m, _ := s.stats.statsMap() | |||
m := s.stats.ToMap() | |||
numFilesOnDisk, numBytesUsedDisk := s.diskFileStats() | |||
m["CurOnDiskBytes"] = numBytesUsedDisk | |||
m["CurOnDiskFiles"] = numFilesOnDisk | |||
// TODO: consider one day removing these backwards compatible | |||
// names for apps using the old names | |||
m["updates"] = m["TotUpdates"] | |||
m["deletes"] = m["TotDeletes"] | |||
m["batches"] = m["TotBatches"] | |||
m["errors"] = m["TotOnErrors"] | |||
m["analysis_time"] = m["TotAnalysisTime"] | |||
m["index_time"] = m["TotIndexTime"] | |||
m["term_searchers_started"] = m["TotTermSearchersStarted"] | |||
m["term_searchers_finished"] = m["TotTermSearchersFinished"] | |||
m["num_plain_text_bytes_indexed"] = m["TotIndexedPlainTextBytes"] | |||
m["num_items_introduced"] = m["TotIntroducedItems"] | |||
m["num_items_persisted"] = m["TotPersistedItems"] | |||
m["num_recs_to_persist"] = m["TotItemsToPersist"] | |||
m["num_bytes_used_disk"] = m["CurOnDiskBytes"] | |||
m["num_files_on_disk"] = m["CurOnDiskFiles"] | |||
m["num_root_memorysegments"] = m["TotMemorySegmentsAtRoot"] | |||
m["num_root_filesegments"] = m["TotFileSegmentsAtRoot"] | |||
m["num_persister_nap_pause_completed"] = m["TotPersisterNapPauseCompleted"] | |||
m["num_persister_nap_merger_break"] = m["TotPersisterMergerNapBreak"] | |||
m["total_compaction_written_bytes"] = m["TotFileMergeWrittenBytes"] | |||
return m | |||
} | |||
@@ -394,7 +528,7 @@ func (s *Scorch) Analyze(d *document.Document) *index.AnalysisResult { | |||
rv.Analyzed[i] = tokenFreqs | |||
rv.Length[i] = fieldLength | |||
if len(d.CompositeFields) > 0 { | |||
if len(d.CompositeFields) > 0 && field.Name() != "_id" { | |||
// see if any of the composite fields need this | |||
for _, compositeField := range d.CompositeFields { | |||
compositeField.Compose(field.Name(), fieldLength, tokenFreqs) | |||
@@ -418,20 +552,43 @@ func (s *Scorch) AddEligibleForRemoval(epoch uint64) { | |||
s.rootLock.Unlock() | |||
} | |||
func (s *Scorch) MemoryUsed() uint64 { | |||
var memUsed uint64 | |||
s.rootLock.RLock() | |||
if s.root != nil { | |||
for _, segmentSnapshot := range s.root.segment { | |||
memUsed += 8 /* size of id -> uint64 */ + | |||
segmentSnapshot.segment.SizeInBytes() | |||
if segmentSnapshot.deleted != nil { | |||
memUsed += segmentSnapshot.deleted.GetSizeInBytes() | |||
} | |||
memUsed += segmentSnapshot.cachedDocs.sizeInBytes() | |||
} | |||
func (s *Scorch) MemoryUsed() (memUsed uint64) { | |||
indexSnapshot := s.currentSnapshot() | |||
if indexSnapshot == nil { | |||
return | |||
} | |||
s.rootLock.RUnlock() | |||
defer func() { | |||
_ = indexSnapshot.Close() | |||
}() | |||
// Account for current root snapshot overhead | |||
memUsed += uint64(indexSnapshot.Size()) | |||
// Account for snapshot that the persister may be working on | |||
persistEpoch := atomic.LoadUint64(&s.iStats.persistEpoch) | |||
persistSnapshotSize := atomic.LoadUint64(&s.iStats.persistSnapshotSize) | |||
if persistEpoch != 0 && indexSnapshot.epoch > persistEpoch { | |||
// the snapshot that the persister is working on isn't the same as | |||
// the current snapshot | |||
memUsed += persistSnapshotSize | |||
} | |||
// Account for snapshot that the merger may be working on | |||
mergeEpoch := atomic.LoadUint64(&s.iStats.mergeEpoch) | |||
mergeSnapshotSize := atomic.LoadUint64(&s.iStats.mergeSnapshotSize) | |||
if mergeEpoch != 0 && indexSnapshot.epoch > mergeEpoch { | |||
// the snapshot that the merger is working on isn't the same as | |||
// the current snapshot | |||
memUsed += mergeSnapshotSize | |||
} | |||
memUsed += (atomic.LoadUint64(&s.iStats.newSegBufBytesAdded) - | |||
atomic.LoadUint64(&s.iStats.newSegBufBytesRemoved)) | |||
memUsed += (atomic.LoadUint64(&s.iStats.analysisBytesAdded) - | |||
atomic.LoadUint64(&s.iStats.analysisBytesRemoved)) | |||
return memUsed | |||
} | |||
@@ -450,3 +607,15 @@ func (s *Scorch) unmarkIneligibleForRemoval(filename string) { | |||
func init() { | |||
registry.RegisterIndexType(Name, NewScorch) | |||
} | |||
func parseToInteger(i interface{}) (int, error) { | |||
switch v := i.(type) { | |||
case float64: | |||
return int(v), nil | |||
case int: | |||
return v, nil | |||
default: | |||
return 0, fmt.Errorf("expects int or float64 value") | |||
} | |||
} |
@@ -17,6 +17,7 @@ package segment | |||
import ( | |||
"github.com/RoaringBitmap/roaring" | |||
"github.com/blevesearch/bleve/index" | |||
"github.com/couchbase/vellum" | |||
) | |||
type EmptySegment struct{} | |||
@@ -29,6 +30,10 @@ func (e *EmptySegment) VisitDocument(num uint64, visitor DocumentFieldValueVisit | |||
return nil | |||
} | |||
func (e *EmptySegment) DocID(num uint64) ([]byte, error) { | |||
return nil, nil | |||
} | |||
func (e *EmptySegment) Count() uint64 { | |||
return 0 | |||
} | |||
@@ -46,6 +51,10 @@ func (e *EmptySegment) Close() error { | |||
return nil | |||
} | |||
func (e *EmptySegment) Size() uint64 { | |||
return 0 | |||
} | |||
func (e *EmptySegment) AddRef() { | |||
} | |||
@@ -55,8 +64,8 @@ func (e *EmptySegment) DecRef() error { | |||
type EmptyDictionary struct{} | |||
func (e *EmptyDictionary) PostingsList(term string, | |||
except *roaring.Bitmap) (PostingsList, error) { | |||
func (e *EmptyDictionary) PostingsList(term []byte, | |||
except *roaring.Bitmap, prealloc PostingsList) (PostingsList, error) { | |||
return &EmptyPostingsList{}, nil | |||
} | |||
@@ -72,18 +81,37 @@ func (e *EmptyDictionary) RangeIterator(start, end string) DictionaryIterator { | |||
return &EmptyDictionaryIterator{} | |||
} | |||
func (e *EmptyDictionary) AutomatonIterator(a vellum.Automaton, | |||
startKeyInclusive, endKeyExclusive []byte) DictionaryIterator { | |||
return &EmptyDictionaryIterator{} | |||
} | |||
func (e *EmptyDictionary) OnlyIterator(onlyTerms [][]byte, | |||
includeCount bool) DictionaryIterator { | |||
return &EmptyDictionaryIterator{} | |||
} | |||
type EmptyDictionaryIterator struct{} | |||
func (e *EmptyDictionaryIterator) Next() (*index.DictEntry, error) { | |||
return nil, nil | |||
} | |||
func (e *EmptyPostingsIterator) Advance(uint64) (Posting, error) { | |||
return nil, nil | |||
} | |||
type EmptyPostingsList struct{} | |||
func (e *EmptyPostingsList) Iterator() PostingsIterator { | |||
func (e *EmptyPostingsList) Iterator(includeFreq, includeNorm, includeLocations bool, | |||
prealloc PostingsIterator) PostingsIterator { | |||
return &EmptyPostingsIterator{} | |||
} | |||
func (e *EmptyPostingsList) Size() int { | |||
return 0 | |||
} | |||
func (e *EmptyPostingsList) Count() uint64 { | |||
return 0 | |||
} | |||
@@ -93,3 +121,9 @@ type EmptyPostingsIterator struct{} | |||
func (e *EmptyPostingsIterator) Next() (Posting, error) { | |||
return nil, nil | |||
} | |||
func (e *EmptyPostingsIterator) Size() int { | |||
return 0 | |||
} | |||
var AnEmptyPostingsIterator = &EmptyPostingsIterator{} |
@@ -1,321 +0,0 @@ | |||
// Copyright (c) 2017 Couchbase, Inc. | |||
// | |||
// Licensed under the Apache License, Version 2.0 (the "License"); | |||
// you may not use this file except in compliance with the License. | |||
// You may obtain a copy of the License at | |||
// | |||
// http://www.apache.org/licenses/LICENSE-2.0 | |||
// | |||
// Unless required by applicable law or agreed to in writing, software | |||
// distributed under the License is distributed on an "AS IS" BASIS, | |||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
// See the License for the specific language governing permissions and | |||
// limitations under the License. | |||
package mem | |||
import ( | |||
"math" | |||
"sort" | |||
"github.com/RoaringBitmap/roaring" | |||
"github.com/blevesearch/bleve/analysis" | |||
"github.com/blevesearch/bleve/document" | |||
"github.com/blevesearch/bleve/index" | |||
) | |||
// NewFromAnalyzedDocs places the analyzed document mutations into a new segment | |||
func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment { | |||
s := New() | |||
// ensure that _id field get fieldID 0 | |||
s.getOrDefineField("_id") | |||
// fill Dicts/DictKeys and preallocate memory | |||
s.initializeDict(results) | |||
// walk each doc | |||
for _, result := range results { | |||
s.processDocument(result) | |||
} | |||
// go back and sort the dictKeys | |||
for _, dict := range s.DictKeys { | |||
sort.Strings(dict) | |||
} | |||
// compute memory usage of segment | |||
s.updateSizeInBytes() | |||
// professional debugging | |||
// | |||
// log.Printf("fields: %v\n", s.FieldsMap) | |||
// log.Printf("fieldsInv: %v\n", s.FieldsInv) | |||
// log.Printf("fieldsLoc: %v\n", s.FieldsLoc) | |||
// log.Printf("dicts: %v\n", s.Dicts) | |||
// log.Printf("dict keys: %v\n", s.DictKeys) | |||
// for i, posting := range s.Postings { | |||
// log.Printf("posting %d: %v\n", i, posting) | |||
// } | |||
// for i, freq := range s.Freqs { | |||
// log.Printf("freq %d: %v\n", i, freq) | |||
// } | |||
// for i, norm := range s.Norms { | |||
// log.Printf("norm %d: %v\n", i, norm) | |||
// } | |||
// for i, field := range s.Locfields { | |||
// log.Printf("field %d: %v\n", i, field) | |||
// } | |||
// for i, start := range s.Locstarts { | |||
// log.Printf("start %d: %v\n", i, start) | |||
// } | |||
// for i, end := range s.Locends { | |||
// log.Printf("end %d: %v\n", i, end) | |||
// } | |||
// for i, pos := range s.Locpos { | |||
// log.Printf("pos %d: %v\n", i, pos) | |||
// } | |||
// for i, apos := range s.Locarraypos { | |||
// log.Printf("apos %d: %v\n", i, apos) | |||
// } | |||
// log.Printf("stored: %v\n", s.Stored) | |||
// log.Printf("stored types: %v\n", s.StoredTypes) | |||
// log.Printf("stored pos: %v\n", s.StoredPos) | |||
return s | |||
} | |||
// fill Dicts/DictKeys and preallocate memory for postings | |||
func (s *Segment) initializeDict(results []*index.AnalysisResult) { | |||
var numPostingsLists int | |||
numTermsPerPostingsList := make([]int, 0, 64) // Keyed by postings list id. | |||
numLocsPerPostingsList := make([]int, 0, 64) // Keyed by postings list id. | |||
var numTokenFrequencies int | |||
var totLocs int | |||
// initial scan for all fieldID's to sort them | |||
for _, result := range results { | |||
for _, field := range result.Document.CompositeFields { | |||
s.getOrDefineField(field.Name()) | |||
} | |||
for _, field := range result.Document.Fields { | |||
s.getOrDefineField(field.Name()) | |||
} | |||
} | |||
sort.Strings(s.FieldsInv[1:]) // keep _id as first field | |||
s.FieldsMap = make(map[string]uint16, len(s.FieldsInv)) | |||
for fieldID, fieldName := range s.FieldsInv { | |||
s.FieldsMap[fieldName] = uint16(fieldID + 1) | |||
} | |||
processField := func(fieldID uint16, tfs analysis.TokenFrequencies) { | |||
for term, tf := range tfs { | |||
pidPlus1, exists := s.Dicts[fieldID][term] | |||
if !exists { | |||
numPostingsLists++ | |||
pidPlus1 = uint64(numPostingsLists) | |||
s.Dicts[fieldID][term] = pidPlus1 | |||
s.DictKeys[fieldID] = append(s.DictKeys[fieldID], term) | |||
numTermsPerPostingsList = append(numTermsPerPostingsList, 0) | |||
numLocsPerPostingsList = append(numLocsPerPostingsList, 0) | |||
} | |||
pid := pidPlus1 - 1 | |||
numTermsPerPostingsList[pid] += 1 | |||
numLocsPerPostingsList[pid] += len(tf.Locations) | |||
totLocs += len(tf.Locations) | |||
} | |||
numTokenFrequencies += len(tfs) | |||
} | |||
for _, result := range results { | |||
// walk each composite field | |||
for _, field := range result.Document.CompositeFields { | |||
fieldID := uint16(s.getOrDefineField(field.Name())) | |||
_, tf := field.Analyze() | |||
processField(fieldID, tf) | |||
} | |||
// walk each field | |||
for i, field := range result.Document.Fields { | |||
fieldID := uint16(s.getOrDefineField(field.Name())) | |||
tf := result.Analyzed[i] | |||
processField(fieldID, tf) | |||
} | |||
} | |||
s.Postings = make([]*roaring.Bitmap, numPostingsLists) | |||
for i := 0; i < numPostingsLists; i++ { | |||
s.Postings[i] = roaring.New() | |||
} | |||
s.PostingsLocs = make([]*roaring.Bitmap, numPostingsLists) | |||
for i := 0; i < numPostingsLists; i++ { | |||
s.PostingsLocs[i] = roaring.New() | |||
} | |||
// Preallocate big, contiguous backing arrays. | |||
auint64Backing := make([][]uint64, numPostingsLists*4+totLocs) // For Freqs, Locstarts, Locends, Locpos, sub-Locarraypos. | |||
uint64Backing := make([]uint64, numTokenFrequencies+totLocs*3) // For sub-Freqs, sub-Locstarts, sub-Locends, sub-Locpos. | |||
float32Backing := make([]float32, numTokenFrequencies) // For sub-Norms. | |||
uint16Backing := make([]uint16, totLocs) // For sub-Locfields. | |||
// Point top-level slices to the backing arrays. | |||
s.Freqs = auint64Backing[0:numPostingsLists] | |||
auint64Backing = auint64Backing[numPostingsLists:] | |||
s.Norms = make([][]float32, numPostingsLists) | |||
s.Locfields = make([][]uint16, numPostingsLists) | |||
s.Locstarts = auint64Backing[0:numPostingsLists] | |||
auint64Backing = auint64Backing[numPostingsLists:] | |||
s.Locends = auint64Backing[0:numPostingsLists] | |||
auint64Backing = auint64Backing[numPostingsLists:] | |||
s.Locpos = auint64Backing[0:numPostingsLists] | |||
auint64Backing = auint64Backing[numPostingsLists:] | |||
s.Locarraypos = make([][][]uint64, numPostingsLists) | |||
// Point sub-slices to the backing arrays. | |||
for pid, numTerms := range numTermsPerPostingsList { | |||
s.Freqs[pid] = uint64Backing[0:0] | |||
uint64Backing = uint64Backing[numTerms:] | |||
s.Norms[pid] = float32Backing[0:0] | |||
float32Backing = float32Backing[numTerms:] | |||
} | |||
for pid, numLocs := range numLocsPerPostingsList { | |||
s.Locfields[pid] = uint16Backing[0:0] | |||
uint16Backing = uint16Backing[numLocs:] | |||
s.Locstarts[pid] = uint64Backing[0:0] | |||
uint64Backing = uint64Backing[numLocs:] | |||
s.Locends[pid] = uint64Backing[0:0] | |||
uint64Backing = uint64Backing[numLocs:] | |||
s.Locpos[pid] = uint64Backing[0:0] | |||
uint64Backing = uint64Backing[numLocs:] | |||
s.Locarraypos[pid] = auint64Backing[0:0] | |||
auint64Backing = auint64Backing[numLocs:] | |||
} | |||
} | |||
func (s *Segment) processDocument(result *index.AnalysisResult) { | |||
// used to collate information across fields | |||
docMap := make(map[uint16]analysis.TokenFrequencies, len(s.FieldsMap)) | |||
fieldLens := make(map[uint16]int, len(s.FieldsMap)) | |||
docNum := uint64(s.addDocument()) | |||
processField := func(field uint16, name string, l int, tf analysis.TokenFrequencies) { | |||
fieldLens[field] += l | |||
if existingFreqs, ok := docMap[field]; ok { | |||
existingFreqs.MergeAll(name, tf) | |||
} else { | |||
docMap[field] = tf | |||
} | |||
} | |||
storeField := func(docNum uint64, field uint16, typ byte, val []byte, pos []uint64) { | |||
s.Stored[docNum][field] = append(s.Stored[docNum][field], val) | |||
s.StoredTypes[docNum][field] = append(s.StoredTypes[docNum][field], typ) | |||
s.StoredPos[docNum][field] = append(s.StoredPos[docNum][field], pos) | |||
} | |||
// walk each composite field | |||
for _, field := range result.Document.CompositeFields { | |||
fieldID := uint16(s.getOrDefineField(field.Name())) | |||
l, tf := field.Analyze() | |||
processField(fieldID, field.Name(), l, tf) | |||
} | |||
// walk each field | |||
for i, field := range result.Document.Fields { | |||
fieldID := uint16(s.getOrDefineField(field.Name())) | |||
l := result.Length[i] | |||
tf := result.Analyzed[i] | |||
processField(fieldID, field.Name(), l, tf) | |||
if field.Options().IsStored() { | |||
storeField(docNum, fieldID, encodeFieldType(field), field.Value(), field.ArrayPositions()) | |||
} | |||
if field.Options().IncludeDocValues() { | |||
s.DocValueFields[fieldID] = true | |||
} | |||
} | |||
// now that its been rolled up into docMap, walk that | |||
for fieldID, tokenFrequencies := range docMap { | |||
for term, tokenFreq := range tokenFrequencies { | |||
pid := s.Dicts[fieldID][term] - 1 | |||
bs := s.Postings[pid] | |||
bs.AddInt(int(docNum)) | |||
s.Freqs[pid] = append(s.Freqs[pid], uint64(tokenFreq.Frequency())) | |||
s.Norms[pid] = append(s.Norms[pid], float32(1.0/math.Sqrt(float64(fieldLens[fieldID])))) | |||
locationBS := s.PostingsLocs[pid] | |||
if len(tokenFreq.Locations) > 0 { | |||
locationBS.AddInt(int(docNum)) | |||
for _, loc := range tokenFreq.Locations { | |||
var locf = fieldID | |||
if loc.Field != "" { | |||
locf = uint16(s.getOrDefineField(loc.Field)) | |||
} | |||
s.Locfields[pid] = append(s.Locfields[pid], locf) | |||
s.Locstarts[pid] = append(s.Locstarts[pid], uint64(loc.Start)) | |||
s.Locends[pid] = append(s.Locends[pid], uint64(loc.End)) | |||
s.Locpos[pid] = append(s.Locpos[pid], uint64(loc.Position)) | |||
if len(loc.ArrayPositions) > 0 { | |||
s.Locarraypos[pid] = append(s.Locarraypos[pid], loc.ArrayPositions) | |||
} else { | |||
s.Locarraypos[pid] = append(s.Locarraypos[pid], nil) | |||
} | |||
} | |||
} | |||
} | |||
} | |||
} | |||
func (s *Segment) getOrDefineField(name string) int { | |||
fieldIDPlus1, ok := s.FieldsMap[name] | |||
if !ok { | |||
fieldIDPlus1 = uint16(len(s.FieldsInv) + 1) | |||
s.FieldsMap[name] = fieldIDPlus1 | |||
s.FieldsInv = append(s.FieldsInv, name) | |||
s.Dicts = append(s.Dicts, make(map[string]uint64)) | |||
s.DictKeys = append(s.DictKeys, make([]string, 0)) | |||
} | |||
return int(fieldIDPlus1 - 1) | |||
} | |||
func (s *Segment) addDocument() int { | |||
docNum := len(s.Stored) | |||
s.Stored = append(s.Stored, map[uint16][][]byte{}) | |||
s.StoredTypes = append(s.StoredTypes, map[uint16][]byte{}) | |||
s.StoredPos = append(s.StoredPos, map[uint16][][]uint64{}) | |||
return docNum | |||
} | |||
func encodeFieldType(f document.Field) byte { | |||
fieldType := byte('x') | |||
switch f.(type) { | |||
case *document.TextField: | |||
fieldType = 't' | |||
case *document.NumericField: | |||
fieldType = 'n' | |||
case *document.DateTimeField: | |||
fieldType = 'd' | |||
case *document.BooleanField: | |||
fieldType = 'b' | |||
case *document.GeoPointField: | |||
fieldType = 'g' | |||
case *document.CompositeField: | |||
fieldType = 'c' | |||
} | |||
return fieldType | |||
} |
@@ -1,103 +0,0 @@ | |||
// Copyright (c) 2017 Couchbase, Inc. | |||
// | |||
// Licensed under the Apache License, Version 2.0 (the "License"); | |||
// you may not use this file except in compliance with the License. | |||
// You may obtain a copy of the License at | |||
// | |||
// http://www.apache.org/licenses/LICENSE-2.0 | |||
// | |||
// Unless required by applicable law or agreed to in writing, software | |||
// distributed under the License is distributed on an "AS IS" BASIS, | |||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
// See the License for the specific language governing permissions and | |||
// limitations under the License. | |||
package mem | |||
import ( | |||
"sort" | |||
"strings" | |||
"github.com/RoaringBitmap/roaring" | |||
"github.com/blevesearch/bleve/index" | |||
"github.com/blevesearch/bleve/index/scorch/segment" | |||
) | |||
// Dictionary is the in-memory representation of the term dictionary | |||
type Dictionary struct { | |||
segment *Segment | |||
field string | |||
fieldID uint16 | |||
} | |||
// PostingsList returns the postings list for the specified term | |||
func (d *Dictionary) PostingsList(term string, | |||
except *roaring.Bitmap) (segment.PostingsList, error) { | |||
return &PostingsList{ | |||
dictionary: d, | |||
term: term, | |||
postingsID: d.segment.Dicts[d.fieldID][term], | |||
except: except, | |||
}, nil | |||
} | |||
// Iterator returns an iterator for this dictionary | |||
func (d *Dictionary) Iterator() segment.DictionaryIterator { | |||
return &DictionaryIterator{ | |||
d: d, | |||
} | |||
} | |||
// PrefixIterator returns an iterator which only visits terms having the | |||
// the specified prefix | |||
func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator { | |||
offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], prefix) | |||
return &DictionaryIterator{ | |||
d: d, | |||
prefix: prefix, | |||
offset: offset, | |||
} | |||
} | |||
// RangeIterator returns an iterator which only visits terms between the | |||
// start and end terms. NOTE: bleve.index API specifies the end is inclusive. | |||
func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator { | |||
offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], start) | |||
return &DictionaryIterator{ | |||
d: d, | |||
offset: offset, | |||
end: end, | |||
} | |||
} | |||
// DictionaryIterator is an iterator for term dictionary | |||
type DictionaryIterator struct { | |||
d *Dictionary | |||
prefix string | |||
end string | |||
offset int | |||
dictEntry index.DictEntry // reused across Next()'s | |||
} | |||
// Next returns the next entry in the dictionary | |||
func (d *DictionaryIterator) Next() (*index.DictEntry, error) { | |||
if d.offset > len(d.d.segment.DictKeys[d.d.fieldID])-1 { | |||
return nil, nil | |||
} | |||
next := d.d.segment.DictKeys[d.d.fieldID][d.offset] | |||
// check prefix | |||
if d.prefix != "" && !strings.HasPrefix(next, d.prefix) { | |||
return nil, nil | |||
} | |||
// check end (bleve.index API demands inclusive end) | |||
if d.end != "" && next > d.end { | |||
return nil, nil | |||
} | |||
d.offset++ | |||
postingID := d.d.segment.Dicts[d.d.fieldID][next] | |||
d.dictEntry.Term = next | |||
d.dictEntry.Count = d.d.segment.Postings[postingID-1].GetCardinality() | |||
return &d.dictEntry, nil | |||
} |
@@ -1,178 +0,0 @@ | |||
// Copyright (c) 2017 Couchbase, Inc. | |||
// | |||
// Licensed under the Apache License, Version 2.0 (the "License"); | |||
// you may not use this file except in compliance with the License. | |||
// You may obtain a copy of the License at | |||
// | |||
// http://www.apache.org/licenses/LICENSE-2.0 | |||
// | |||
// Unless required by applicable law or agreed to in writing, software | |||
// distributed under the License is distributed on an "AS IS" BASIS, | |||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
// See the License for the specific language governing permissions and | |||
// limitations under the License. | |||
package mem | |||
import ( | |||
"github.com/RoaringBitmap/roaring" | |||
"github.com/blevesearch/bleve/index/scorch/segment" | |||
) | |||
// PostingsList is an in-memory represenation of a postings list | |||
type PostingsList struct { | |||
dictionary *Dictionary | |||
term string | |||
postingsID uint64 | |||
except *roaring.Bitmap | |||
} | |||
// Count returns the number of items on this postings list | |||
func (p *PostingsList) Count() uint64 { | |||
var rv uint64 | |||
if p.postingsID > 0 { | |||
rv = p.dictionary.segment.Postings[p.postingsID-1].GetCardinality() | |||
if p.except != nil { | |||
except := p.except.GetCardinality() | |||
if except > rv { | |||
// avoid underflow | |||
except = rv | |||
} | |||
rv -= except | |||
} | |||
} | |||
return rv | |||
} | |||
// Iterator returns an iterator for this postings list | |||
func (p *PostingsList) Iterator() segment.PostingsIterator { | |||
rv := &PostingsIterator{ | |||
postings: p, | |||
} | |||
if p.postingsID > 0 { | |||
allbits := p.dictionary.segment.Postings[p.postingsID-1] | |||
rv.locations = p.dictionary.segment.PostingsLocs[p.postingsID-1] | |||
rv.all = allbits.Iterator() | |||
if p.except != nil { | |||
allExcept := allbits.Clone() | |||
allExcept.AndNot(p.except) | |||
rv.actual = allExcept.Iterator() | |||
} else { | |||
rv.actual = allbits.Iterator() | |||
} | |||
} | |||
return rv | |||
} | |||
// PostingsIterator provides a way to iterate through the postings list | |||
type PostingsIterator struct { | |||
postings *PostingsList | |||
all roaring.IntIterable | |||
locations *roaring.Bitmap | |||
offset int | |||
locoffset int | |||
actual roaring.IntIterable | |||
} | |||
// Next returns the next posting on the postings list, or nil at the end | |||
func (i *PostingsIterator) Next() (segment.Posting, error) { | |||
if i.actual == nil || !i.actual.HasNext() { | |||
return nil, nil | |||
} | |||
n := i.actual.Next() | |||
allN := i.all.Next() | |||
// n is the next actual hit (excluding some postings) | |||
// allN is the next hit in the full postings | |||
// if they don't match, adjust offsets to factor in item we're skipping over | |||
// incr the all iterator, and check again | |||
for allN != n { | |||
i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset]) | |||
i.offset++ | |||
allN = i.all.Next() | |||
} | |||
rv := &Posting{ | |||
iterator: i, | |||
docNum: uint64(n), | |||
offset: i.offset, | |||
locoffset: i.locoffset, | |||
hasLoc: i.locations.Contains(n), | |||
} | |||
i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset]) | |||
i.offset++ | |||
return rv, nil | |||
} | |||
// Posting is a single entry in a postings list | |||
type Posting struct { | |||
iterator *PostingsIterator | |||
docNum uint64 | |||
offset int | |||
locoffset int | |||
hasLoc bool | |||
} | |||
// Number returns the document number of this posting in this segment | |||
func (p *Posting) Number() uint64 { | |||
return p.docNum | |||
} | |||
// Frequency returns the frequence of occurance of this term in this doc/field | |||
func (p *Posting) Frequency() uint64 { | |||
return p.iterator.postings.dictionary.segment.Freqs[p.iterator.postings.postingsID-1][p.offset] | |||
} | |||
// Norm returns the normalization factor for this posting | |||
func (p *Posting) Norm() float64 { | |||
return float64(p.iterator.postings.dictionary.segment.Norms[p.iterator.postings.postingsID-1][p.offset]) | |||
} | |||
// Locations returns the location information for each occurance | |||
func (p *Posting) Locations() []segment.Location { | |||
if !p.hasLoc { | |||
return nil | |||
} | |||
freq := int(p.Frequency()) | |||
rv := make([]segment.Location, freq) | |||
for i := 0; i < freq; i++ { | |||
rv[i] = &Location{ | |||
p: p, | |||
offset: p.locoffset + i, | |||
} | |||
} | |||
return rv | |||
} | |||
// Location represents the location of a single occurance | |||
type Location struct { | |||
p *Posting | |||
offset int | |||
} | |||
// Field returns the name of the field (useful in composite fields to know | |||
// which original field the value came from) | |||
func (l *Location) Field() string { | |||
return l.p.iterator.postings.dictionary.segment.FieldsInv[l.p.iterator.postings.dictionary.segment.Locfields[l.p.iterator.postings.postingsID-1][l.offset]] | |||
} | |||
// Start returns the start byte offset of this occurance | |||
func (l *Location) Start() uint64 { | |||
return l.p.iterator.postings.dictionary.segment.Locstarts[l.p.iterator.postings.postingsID-1][l.offset] | |||
} | |||
// End returns the end byte offset of this occurance | |||
func (l *Location) End() uint64 { | |||
return l.p.iterator.postings.dictionary.segment.Locends[l.p.iterator.postings.postingsID-1][l.offset] | |||
} | |||
// Pos returns the 1-based phrase position of this occurance | |||
func (l *Location) Pos() uint64 { | |||
return l.p.iterator.postings.dictionary.segment.Locpos[l.p.iterator.postings.postingsID-1][l.offset] | |||
} | |||
// ArrayPositions returns the array position vector associated with this occurance | |||
func (l *Location) ArrayPositions() []uint64 { | |||
return l.p.iterator.postings.dictionary.segment.Locarraypos[l.p.iterator.postings.postingsID-1][l.offset] | |||
} |
@@ -1,289 +0,0 @@ | |||
// Copyright (c) 2017 Couchbase, Inc. | |||
// | |||
// Licensed under the Apache License, Version 2.0 (the "License"); | |||
// you may not use this file except in compliance with the License. | |||
// You may obtain a copy of the License at | |||
// | |||
// http://www.apache.org/licenses/LICENSE-2.0 | |||
// | |||
// Unless required by applicable law or agreed to in writing, software | |||
// distributed under the License is distributed on an "AS IS" BASIS, | |||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
// See the License for the specific language governing permissions and | |||
// limitations under the License. | |||
package mem | |||
import ( | |||
"fmt" | |||
"github.com/RoaringBitmap/roaring" | |||
"github.com/blevesearch/bleve/index/scorch/segment" | |||
) | |||
// _id field is always guaranteed to have fieldID of 0 | |||
const idFieldID uint16 = 0 | |||
// KNOWN ISSUES | |||
// - LIMITATION - we decided whether or not to store term vectors for a field | |||
// at the segment level, based on the first definition of a | |||
// field we see. in normal bleve usage this is fine, all | |||
// instances of a field definition will be the same. however, | |||
// advanced users may violate this and provide unique field | |||
// definitions with each document. this segment does not | |||
// support this usage. | |||
// TODO | |||
// - need better testing of multiple docs, iterating freqs, locations and | |||
// and verifying the correct results are returned | |||
// Segment is an in memory implementation of scorch.Segment | |||
type Segment struct { | |||
// FieldsMap adds 1 to field id to avoid zero value issues | |||
// name -> field id + 1 | |||
FieldsMap map[string]uint16 | |||
// FieldsInv is the inverse of FieldsMap | |||
// field id -> name | |||
FieldsInv []string | |||
// Term dictionaries for each field | |||
// field id -> term -> postings list id + 1 | |||
Dicts []map[string]uint64 | |||
// Terms for each field, where terms are sorted ascending | |||
// field id -> []term | |||
DictKeys [][]string | |||
// Postings list | |||
// postings list id -> bitmap by docNum | |||
Postings []*roaring.Bitmap | |||
// Postings list has locations | |||
PostingsLocs []*roaring.Bitmap | |||
// Term frequencies | |||
// postings list id -> Freqs (one for each hit in bitmap) | |||
Freqs [][]uint64 | |||
// Field norms | |||
// postings list id -> Norms (one for each hit in bitmap) | |||
Norms [][]float32 | |||
// Field/start/end/pos/locarraypos | |||
// postings list id -> start/end/pos/locarraypos (one for each freq) | |||
Locfields [][]uint16 | |||
Locstarts [][]uint64 | |||
Locends [][]uint64 | |||
Locpos [][]uint64 | |||
Locarraypos [][][]uint64 | |||
// Stored field values | |||
// docNum -> field id -> slice of values (each value []byte) | |||
Stored []map[uint16][][]byte | |||
// Stored field types | |||
// docNum -> field id -> slice of types (each type byte) | |||
StoredTypes []map[uint16][]byte | |||
// Stored field array positions | |||
// docNum -> field id -> slice of array positions (each is []uint64) | |||
StoredPos []map[uint16][][]uint64 | |||
// For storing the docValue persisted fields | |||
DocValueFields map[uint16]bool | |||
// Footprint of the segment, updated when analyzed document mutations | |||
// are added into the segment | |||
sizeInBytes uint64 | |||
} | |||
// New builds a new empty Segment | |||
func New() *Segment { | |||
return &Segment{ | |||
FieldsMap: map[string]uint16{}, | |||
DocValueFields: map[uint16]bool{}, | |||
} | |||
} | |||
func (s *Segment) updateSizeInBytes() { | |||
var sizeInBytes uint64 | |||
// FieldsMap, FieldsInv | |||
for k, _ := range s.FieldsMap { | |||
sizeInBytes += uint64((len(k)+int(segment.SizeOfString))*2 + | |||
2 /* size of uint16 */) | |||
} | |||
// overhead from the data structures | |||
sizeInBytes += (segment.SizeOfMap + segment.SizeOfSlice) | |||
// Dicts, DictKeys | |||
for _, entry := range s.Dicts { | |||
for k, _ := range entry { | |||
sizeInBytes += uint64((len(k)+int(segment.SizeOfString))*2 + | |||
8 /* size of uint64 */) | |||
} | |||
// overhead from the data structures | |||
sizeInBytes += (segment.SizeOfMap + segment.SizeOfSlice) | |||
} | |||
sizeInBytes += (segment.SizeOfSlice * 2) | |||
// Postings, PostingsLocs | |||
for i := 0; i < len(s.Postings); i++ { | |||
sizeInBytes += (s.Postings[i].GetSizeInBytes() + segment.SizeOfPointer) + | |||
(s.PostingsLocs[i].GetSizeInBytes() + segment.SizeOfPointer) | |||
} | |||
sizeInBytes += (segment.SizeOfSlice * 2) | |||
// Freqs, Norms | |||
for i := 0; i < len(s.Freqs); i++ { | |||
sizeInBytes += uint64(len(s.Freqs[i])*8 /* size of uint64 */ + | |||
len(s.Norms[i])*4 /* size of float32 */) + | |||
(segment.SizeOfSlice * 2) | |||
} | |||
sizeInBytes += (segment.SizeOfSlice * 2) | |||
// Location data | |||
for i := 0; i < len(s.Locfields); i++ { | |||
sizeInBytes += uint64(len(s.Locfields[i])*2 /* size of uint16 */ + | |||
len(s.Locstarts[i])*8 /* size of uint64 */ + | |||
len(s.Locends[i])*8 /* size of uint64 */ + | |||
len(s.Locpos[i])*8 /* size of uint64 */) | |||
for j := 0; j < len(s.Locarraypos[i]); j++ { | |||
sizeInBytes += uint64(len(s.Locarraypos[i][j])*8 /* size of uint64 */) + | |||
segment.SizeOfSlice | |||
} | |||
sizeInBytes += (segment.SizeOfSlice * 5) | |||
} | |||
sizeInBytes += (segment.SizeOfSlice * 5) | |||
// Stored data | |||
for i := 0; i < len(s.Stored); i++ { | |||
for _, v := range s.Stored[i] { | |||
sizeInBytes += uint64(2 /* size of uint16 */) | |||
for _, arr := range v { | |||
sizeInBytes += uint64(len(arr)) + segment.SizeOfSlice | |||
} | |||
sizeInBytes += segment.SizeOfSlice | |||
} | |||
for _, v := range s.StoredTypes[i] { | |||
sizeInBytes += uint64(2 /* size of uint16 */ +len(v)) + segment.SizeOfSlice | |||
} | |||
for _, v := range s.StoredPos[i] { | |||
sizeInBytes += uint64(2 /* size of uint16 */) | |||
for _, arr := range v { | |||
sizeInBytes += uint64(len(arr)*8 /* size of uint64 */) + | |||
segment.SizeOfSlice | |||
} | |||
sizeInBytes += segment.SizeOfSlice | |||
} | |||
// overhead from map(s) within Stored, StoredTypes, StoredPos | |||
sizeInBytes += (segment.SizeOfMap * 3) | |||
} | |||
// overhead from data structures: Stored, StoredTypes, StoredPos | |||
sizeInBytes += (segment.SizeOfSlice * 3) | |||
// DocValueFields | |||
sizeInBytes += uint64(len(s.DocValueFields)*3 /* size of uint16 + bool */) + | |||
segment.SizeOfMap | |||
// SizeInBytes | |||
sizeInBytes += uint64(8) | |||
s.sizeInBytes = sizeInBytes | |||
} | |||
func (s *Segment) SizeInBytes() uint64 { | |||
return s.sizeInBytes | |||
} | |||
func (s *Segment) AddRef() { | |||
} | |||
func (s *Segment) DecRef() error { | |||
return nil | |||
} | |||
// Fields returns the field names used in this segment | |||
func (s *Segment) Fields() []string { | |||
return s.FieldsInv | |||
} | |||
// VisitDocument invokes the DocFieldValueVistor for each stored field | |||
// for the specified doc number | |||
func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error { | |||
// ensure document number exists | |||
if int(num) > len(s.Stored)-1 { | |||
return nil | |||
} | |||
docFields := s.Stored[int(num)] | |||
st := s.StoredTypes[int(num)] | |||
sp := s.StoredPos[int(num)] | |||
for field, values := range docFields { | |||
for i, value := range values { | |||
keepGoing := visitor(s.FieldsInv[field], st[field][i], value, sp[field][i]) | |||
if !keepGoing { | |||
return nil | |||
} | |||
} | |||
} | |||
return nil | |||
} | |||
func (s *Segment) getField(name string) (int, error) { | |||
fieldID, ok := s.FieldsMap[name] | |||
if !ok { | |||
return 0, fmt.Errorf("no field named %s", name) | |||
} | |||
return int(fieldID - 1), nil | |||
} | |||
// Dictionary returns the term dictionary for the specified field | |||
func (s *Segment) Dictionary(field string) (segment.TermDictionary, error) { | |||
fieldID, err := s.getField(field) | |||
if err != nil { | |||
// no such field, return empty dictionary | |||
return &segment.EmptyDictionary{}, nil | |||
} | |||
return &Dictionary{ | |||
segment: s, | |||
field: field, | |||
fieldID: uint16(fieldID), | |||
}, nil | |||
} | |||
// Count returns the number of documents in this segment | |||
// (this has no notion of deleted docs) | |||
func (s *Segment) Count() uint64 { | |||
return uint64(len(s.Stored)) | |||
} | |||
// DocNumbers returns a bitset corresponding to the doc numbers of all the | |||
// provided _id strings | |||
func (s *Segment) DocNumbers(ids []string) (*roaring.Bitmap, error) { | |||
rv := roaring.New() | |||
// guard against empty segment | |||
if len(s.FieldsMap) > 0 { | |||
idDictionary := s.Dicts[idFieldID] | |||
for _, id := range ids { | |||
postingID := idDictionary[id] | |||
if postingID > 0 { | |||
rv.Or(s.Postings[postingID-1]) | |||
} | |||
} | |||
} | |||
return rv, nil | |||
} | |||
// Close releases all resources associated with this segment | |||
func (s *Segment) Close() error { | |||
return nil | |||
} |
@@ -0,0 +1,75 @@ | |||
// Copyright (c) 2018 Couchbase, Inc. | |||
// | |||
// Licensed under the Apache License, Version 2.0 (the "License"); | |||
// you may not use this file except in compliance with the License. | |||
// You may obtain a copy of the License at | |||
// | |||
// http://www.apache.org/licenses/LICENSE-2.0 | |||
// | |||
// Unless required by applicable law or agreed to in writing, software | |||
// distributed under the License is distributed on an "AS IS" BASIS, | |||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
// See the License for the specific language governing permissions and | |||
// limitations under the License. | |||
package segment | |||
import ( | |||
"regexp/syntax" | |||
"github.com/couchbase/vellum/regexp" | |||
) | |||
func ParseRegexp(pattern string) (a *regexp.Regexp, prefixBeg, prefixEnd []byte, err error) { | |||
// TODO: potential optimization where syntax.Regexp supports a Simplify() API? | |||
parsed, err := syntax.Parse(pattern, syntax.Perl) | |||
if err != nil { | |||
return nil, nil, nil, err | |||
} | |||
re, err := regexp.NewParsedWithLimit(pattern, parsed, regexp.DefaultLimit) | |||
if err != nil { | |||
return nil, nil, nil, err | |||
} | |||
prefix := LiteralPrefix(parsed) | |||
if prefix != "" { | |||
prefixBeg := []byte(prefix) | |||
prefixEnd := IncrementBytes(prefixBeg) | |||
return re, prefixBeg, prefixEnd, nil | |||
} | |||
return re, nil, nil, nil | |||
} | |||
// Returns the literal prefix given the parse tree for a regexp | |||
func LiteralPrefix(s *syntax.Regexp) string { | |||
// traverse the left-most branch in the parse tree as long as the | |||
// node represents a concatenation | |||
for s != nil && s.Op == syntax.OpConcat { | |||
if len(s.Sub) < 1 { | |||
return "" | |||
} | |||
s = s.Sub[0] | |||
} | |||
if s.Op == syntax.OpLiteral { | |||
return string(s.Rune) | |||
} | |||
return "" // no literal prefix | |||
} | |||
func IncrementBytes(in []byte) []byte { | |||
rv := make([]byte, len(in)) | |||
copy(rv, in) | |||
for i := len(rv) - 1; i >= 0; i-- { | |||
rv[i] = rv[i] + 1 | |||
if rv[i] != 0 { | |||
return rv // didn't overflow, so stop | |||
} | |||
} | |||
return nil // overflowed | |||
} |
@@ -15,15 +15,14 @@ | |||
package segment | |||
import ( | |||
"fmt" | |||
"github.com/RoaringBitmap/roaring" | |||
"github.com/blevesearch/bleve/index" | |||
"github.com/couchbase/vellum" | |||
) | |||
// Overhead from go data structures when deployed on a 64-bit system. | |||
const SizeOfMap uint64 = 8 | |||
const SizeOfPointer uint64 = 8 | |||
const SizeOfSlice uint64 = 24 | |||
const SizeOfString uint64 = 16 | |||
var ErrClosed = fmt.Errorf("index closed") | |||
// DocumentFieldValueVisitor defines a callback to be visited for each | |||
// stored field value. The return value determines if the visitor | |||
@@ -34,6 +33,9 @@ type Segment interface { | |||
Dictionary(field string) (TermDictionary, error) | |||
VisitDocument(num uint64, visitor DocumentFieldValueVisitor) error | |||
DocID(num uint64) ([]byte, error) | |||
Count() uint64 | |||
DocNumbers([]string) (*roaring.Bitmap, error) | |||
@@ -42,18 +44,21 @@ type Segment interface { | |||
Close() error | |||
SizeInBytes() uint64 | |||
Size() int | |||
AddRef() | |||
DecRef() error | |||
} | |||
type TermDictionary interface { | |||
PostingsList(term string, except *roaring.Bitmap) (PostingsList, error) | |||
PostingsList(term []byte, except *roaring.Bitmap, prealloc PostingsList) (PostingsList, error) | |||
Iterator() DictionaryIterator | |||
PrefixIterator(prefix string) DictionaryIterator | |||
RangeIterator(start, end string) DictionaryIterator | |||
AutomatonIterator(a vellum.Automaton, | |||
startKeyInclusive, endKeyExclusive []byte) DictionaryIterator | |||
OnlyIterator(onlyTerms [][]byte, includeCount bool) DictionaryIterator | |||
} | |||
type DictionaryIterator interface { | |||
@@ -61,7 +66,9 @@ type DictionaryIterator interface { | |||
} | |||
type PostingsList interface { | |||
Iterator() PostingsIterator | |||
Iterator(includeFreq, includeNorm, includeLocations bool, prealloc PostingsIterator) PostingsIterator | |||
Size() int | |||
Count() uint64 | |||
@@ -77,6 +84,14 @@ type PostingsIterator interface { | |||
// implementations may return a shared instance to reduce memory | |||
// allocations. | |||
Next() (Posting, error) | |||
// Advance will return the posting with the specified doc number | |||
// or if there is no such posting, the next posting. | |||
// Callers MUST NOT attempt to pass a docNum that is less than or | |||
// equal to the currently visited posting doc Num. | |||
Advance(docNum uint64) (Posting, error) | |||
Size() int | |||
} | |||
type Posting interface { | |||
@@ -86,6 +101,8 @@ type Posting interface { | |||
Norm() float64 | |||
Locations() []Location | |||
Size() int | |||
} | |||
type Location interface { | |||
@@ -94,6 +111,7 @@ type Location interface { | |||
End() uint64 | |||
Pos() uint64 | |||
ArrayPositions() []uint64 | |||
Size() int | |||
} | |||
// DocumentFieldTermVisitable is implemented by various scorch segment | |||
@@ -101,10 +119,17 @@ type Location interface { | |||
// postings or other indexed values. | |||
type DocumentFieldTermVisitable interface { | |||
VisitDocumentFieldTerms(localDocNum uint64, fields []string, | |||
visitor index.DocumentFieldTermVisitor) error | |||
visitor index.DocumentFieldTermVisitor, optional DocVisitState) (DocVisitState, error) | |||
// VisitableDocValueFields implementation should return | |||
// the list of fields which are document value persisted and | |||
// therefore visitable by the above VisitDocumentFieldTerms method. | |||
VisitableDocValueFields() ([]string, error) | |||
} | |||
type DocVisitState interface { | |||
} | |||
type StatsReporter interface { | |||
ReportBytesWritten(bytesWritten uint64) | |||
} |
@@ -16,19 +16,13 @@ package zap | |||
import ( | |||
"bufio" | |||
"bytes" | |||
"encoding/binary" | |||
"math" | |||
"os" | |||
"sort" | |||
"github.com/Smerity/govarint" | |||
"github.com/blevesearch/bleve/index/scorch/segment/mem" | |||
"github.com/couchbase/vellum" | |||
"github.com/golang/snappy" | |||
) | |||
const version uint32 = 3 | |||
const Version uint32 = 11 | |||
const Type string = "zap" | |||
const fieldNotUninverted = math.MaxUint64 | |||
@@ -82,219 +76,39 @@ func PersistSegmentBase(sb *SegmentBase, path string) error { | |||
return nil | |||
} | |||
// PersistSegment takes the in-memory segment and persists it to | |||
// the specified path in the zap file format. | |||
func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) error { | |||
flag := os.O_RDWR | os.O_CREATE | |||
f, err := os.OpenFile(path, flag, 0600) | |||
if err != nil { | |||
return err | |||
} | |||
cleanup := func() { | |||
_ = f.Close() | |||
_ = os.Remove(path) | |||
} | |||
// buffer the output | |||
br := bufio.NewWriter(f) | |||
// wrap it for counting (tracking offsets) | |||
cr := NewCountHashWriter(br) | |||
numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, _, err := | |||
persistBase(memSegment, cr, chunkFactor) | |||
if err != nil { | |||
cleanup() | |||
return err | |||
} | |||
err = persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, | |||
chunkFactor, cr.Sum32(), cr) | |||
if err != nil { | |||
cleanup() | |||
return err | |||
} | |||
err = br.Flush() | |||
if err != nil { | |||
cleanup() | |||
return err | |||
} | |||
err = f.Sync() | |||
if err != nil { | |||
cleanup() | |||
return err | |||
} | |||
err = f.Close() | |||
if err != nil { | |||
cleanup() | |||
return err | |||
} | |||
return nil | |||
} | |||
func persistBase(memSegment *mem.Segment, cr *CountHashWriter, chunkFactor uint32) ( | |||
numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64, | |||
dictLocs []uint64, err error) { | |||
docValueOffset = uint64(fieldNotUninverted) | |||
if len(memSegment.Stored) > 0 { | |||
storedIndexOffset, err = persistStored(memSegment, cr) | |||
if err != nil { | |||
return 0, 0, 0, 0, nil, err | |||
} | |||
freqOffsets, locOffsets, err := persistPostingDetails(memSegment, cr, chunkFactor) | |||
if err != nil { | |||
return 0, 0, 0, 0, nil, err | |||
} | |||
postingsListLocs, err := persistPostingsLocs(memSegment, cr) | |||
if err != nil { | |||
return 0, 0, 0, 0, nil, err | |||
} | |||
postingsLocs, err := persistPostingsLists(memSegment, cr, postingsListLocs, freqOffsets, locOffsets) | |||
if err != nil { | |||
return 0, 0, 0, 0, nil, err | |||
} | |||
dictLocs, err = persistDictionary(memSegment, cr, postingsLocs) | |||
if err != nil { | |||
return 0, 0, 0, 0, nil, err | |||
} | |||
docValueOffset, err = persistFieldDocValues(memSegment, cr, chunkFactor) | |||
if err != nil { | |||
return 0, 0, 0, 0, nil, err | |||
} | |||
} else { | |||
dictLocs = make([]uint64, len(memSegment.FieldsInv)) | |||
} | |||
fieldsIndexOffset, err = persistFields(memSegment.FieldsInv, cr, dictLocs) | |||
if err != nil { | |||
return 0, 0, 0, 0, nil, err | |||
} | |||
return uint64(len(memSegment.Stored)), storedIndexOffset, fieldsIndexOffset, docValueOffset, | |||
dictLocs, nil | |||
} | |||
func persistStored(memSegment *mem.Segment, w *CountHashWriter) (uint64, error) { | |||
var curr int | |||
var metaBuf bytes.Buffer | |||
var data, compressed []byte | |||
metaEncoder := govarint.NewU64Base128Encoder(&metaBuf) | |||
docNumOffsets := make(map[int]uint64, len(memSegment.Stored)) | |||
for docNum, storedValues := range memSegment.Stored { | |||
if docNum != 0 { | |||
// reset buffer if necessary | |||
curr = 0 | |||
metaBuf.Reset() | |||
data = data[:0] | |||
compressed = compressed[:0] | |||
} | |||
st := memSegment.StoredTypes[docNum] | |||
sp := memSegment.StoredPos[docNum] | |||
// encode fields in order | |||
for fieldID := range memSegment.FieldsInv { | |||
if storedFieldValues, ok := storedValues[uint16(fieldID)]; ok { | |||
stf := st[uint16(fieldID)] | |||
spf := sp[uint16(fieldID)] | |||
var err2 error | |||
curr, data, err2 = persistStoredFieldValues(fieldID, | |||
storedFieldValues, stf, spf, curr, metaEncoder, data) | |||
if err2 != nil { | |||
return 0, err2 | |||
} | |||
} | |||
} | |||
metaEncoder.Close() | |||
metaBytes := metaBuf.Bytes() | |||
// compress the data | |||
compressed = snappy.Encode(compressed, data) | |||
// record where we're about to start writing | |||
docNumOffsets[docNum] = uint64(w.Count()) | |||
// write out the meta len and compressed data len | |||
_, err := writeUvarints(w, uint64(len(metaBytes)), uint64(len(compressed))) | |||
if err != nil { | |||
return 0, err | |||
} | |||
// now write the meta | |||
_, err = w.Write(metaBytes) | |||
if err != nil { | |||
return 0, err | |||
} | |||
// now write the compressed data | |||
_, err = w.Write(compressed) | |||
if err != nil { | |||
return 0, err | |||
} | |||
} | |||
// return value is the start of the stored index | |||
rv := uint64(w.Count()) | |||
// now write out the stored doc index | |||
for docNum := range memSegment.Stored { | |||
err := binary.Write(w, binary.BigEndian, docNumOffsets[docNum]) | |||
if err != nil { | |||
return 0, err | |||
} | |||
} | |||
return rv, nil | |||
} | |||
func persistStoredFieldValues(fieldID int, | |||
storedFieldValues [][]byte, stf []byte, spf [][]uint64, | |||
curr int, metaEncoder *govarint.Base128Encoder, data []byte) ( | |||
curr int, metaEncode varintEncoder, data []byte) ( | |||
int, []byte, error) { | |||
for i := 0; i < len(storedFieldValues); i++ { | |||
// encode field | |||
_, err := metaEncoder.PutU64(uint64(fieldID)) | |||
_, err := metaEncode(uint64(fieldID)) | |||
if err != nil { | |||
return 0, nil, err | |||
} | |||
// encode type | |||
_, err = metaEncoder.PutU64(uint64(stf[i])) | |||
_, err = metaEncode(uint64(stf[i])) | |||
if err != nil { | |||
return 0, nil, err | |||
} | |||
// encode start offset | |||
_, err = metaEncoder.PutU64(uint64(curr)) | |||
_, err = metaEncode(uint64(curr)) | |||
if err != nil { | |||
return 0, nil, err | |||
} | |||
// end len | |||
_, err = metaEncoder.PutU64(uint64(len(storedFieldValues[i]))) | |||
_, err = metaEncode(uint64(len(storedFieldValues[i]))) | |||
if err != nil { | |||
return 0, nil, err | |||
} | |||
// encode number of array pos | |||
_, err = metaEncoder.PutU64(uint64(len(spf[i]))) | |||
_, err = metaEncode(uint64(len(spf[i]))) | |||
if err != nil { | |||
return 0, nil, err | |||
} | |||
// encode all array positions | |||
for _, pos := range spf[i] { | |||
_, err = metaEncoder.PutU64(pos) | |||
_, err = metaEncode(pos) | |||
if err != nil { | |||
return 0, nil, err | |||
} | |||
@@ -307,337 +121,6 @@ func persistStoredFieldValues(fieldID int, | |||
return curr, data, nil | |||
} | |||
func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFactor uint32) ([]uint64, []uint64, error) { | |||
var freqOffsets, locOfffsets []uint64 | |||
tfEncoder := newChunkedIntCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1)) | |||
for postingID := range memSegment.Postings { | |||
if postingID != 0 { | |||
tfEncoder.Reset() | |||
} | |||
freqs := memSegment.Freqs[postingID] | |||
norms := memSegment.Norms[postingID] | |||
postingsListItr := memSegment.Postings[postingID].Iterator() | |||
var offset int | |||
for postingsListItr.HasNext() { | |||
docNum := uint64(postingsListItr.Next()) | |||
// put freq | |||
err := tfEncoder.Add(docNum, freqs[offset]) | |||
if err != nil { | |||
return nil, nil, err | |||
} | |||
// put norm | |||
norm := norms[offset] | |||
normBits := math.Float32bits(norm) | |||
err = tfEncoder.Add(docNum, uint64(normBits)) | |||
if err != nil { | |||
return nil, nil, err | |||
} | |||
offset++ | |||
} | |||
// record where this postings freq info starts | |||
freqOffsets = append(freqOffsets, uint64(w.Count())) | |||
tfEncoder.Close() | |||
_, err := tfEncoder.Write(w) | |||
if err != nil { | |||
return nil, nil, err | |||
} | |||
} | |||
// now do it again for the locations | |||
locEncoder := newChunkedIntCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1)) | |||
for postingID := range memSegment.Postings { | |||
if postingID != 0 { | |||
locEncoder.Reset() | |||
} | |||
freqs := memSegment.Freqs[postingID] | |||
locfields := memSegment.Locfields[postingID] | |||
locpos := memSegment.Locpos[postingID] | |||
locstarts := memSegment.Locstarts[postingID] | |||
locends := memSegment.Locends[postingID] | |||
locarraypos := memSegment.Locarraypos[postingID] | |||
postingsListItr := memSegment.Postings[postingID].Iterator() | |||
var offset int | |||
var locOffset int | |||
for postingsListItr.HasNext() { | |||
docNum := uint64(postingsListItr.Next()) | |||
for i := 0; i < int(freqs[offset]); i++ { | |||
if len(locfields) > 0 { | |||
// put field | |||
err := locEncoder.Add(docNum, uint64(locfields[locOffset])) | |||
if err != nil { | |||
return nil, nil, err | |||
} | |||
// put pos | |||
err = locEncoder.Add(docNum, locpos[locOffset]) | |||
if err != nil { | |||
return nil, nil, err | |||
} | |||
// put start | |||
err = locEncoder.Add(docNum, locstarts[locOffset]) | |||
if err != nil { | |||
return nil, nil, err | |||
} | |||
// put end | |||
err = locEncoder.Add(docNum, locends[locOffset]) | |||
if err != nil { | |||
return nil, nil, err | |||
} | |||
// put the number of array positions to follow | |||
num := len(locarraypos[locOffset]) | |||
err = locEncoder.Add(docNum, uint64(num)) | |||
if err != nil { | |||
return nil, nil, err | |||
} | |||
// put each array position | |||
for _, pos := range locarraypos[locOffset] { | |||
err = locEncoder.Add(docNum, pos) | |||
if err != nil { | |||
return nil, nil, err | |||
} | |||
} | |||
} | |||
locOffset++ | |||
} | |||
offset++ | |||
} | |||
// record where this postings loc info starts | |||
locOfffsets = append(locOfffsets, uint64(w.Count())) | |||
locEncoder.Close() | |||
_, err := locEncoder.Write(w) | |||
if err != nil { | |||
return nil, nil, err | |||
} | |||
} | |||
return freqOffsets, locOfffsets, nil | |||
} | |||
func persistPostingsLocs(memSegment *mem.Segment, w *CountHashWriter) (rv []uint64, err error) { | |||
rv = make([]uint64, 0, len(memSegment.PostingsLocs)) | |||
var reuseBuf bytes.Buffer | |||
reuseBufVarint := make([]byte, binary.MaxVarintLen64) | |||
for postingID := range memSegment.PostingsLocs { | |||
// record where we start this posting loc | |||
rv = append(rv, uint64(w.Count())) | |||
// write out the length and bitmap | |||
_, err = writeRoaringWithLen(memSegment.PostingsLocs[postingID], w, &reuseBuf, reuseBufVarint) | |||
if err != nil { | |||
return nil, err | |||
} | |||
} | |||
return rv, nil | |||
} | |||
func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter, | |||
postingsListLocs, freqOffsets, locOffsets []uint64) (rv []uint64, err error) { | |||
rv = make([]uint64, 0, len(memSegment.Postings)) | |||
var reuseBuf bytes.Buffer | |||
reuseBufVarint := make([]byte, binary.MaxVarintLen64) | |||
for postingID := range memSegment.Postings { | |||
// record where we start this posting list | |||
rv = append(rv, uint64(w.Count())) | |||
// write out the term info, loc info, and loc posting list offset | |||
_, err = writeUvarints(w, freqOffsets[postingID], | |||
locOffsets[postingID], postingsListLocs[postingID]) | |||
if err != nil { | |||
return nil, err | |||
} | |||
// write out the length and bitmap | |||
_, err = writeRoaringWithLen(memSegment.Postings[postingID], w, &reuseBuf, reuseBufVarint) | |||
if err != nil { | |||
return nil, err | |||
} | |||
} | |||
return rv, nil | |||
} | |||
func persistDictionary(memSegment *mem.Segment, w *CountHashWriter, postingsLocs []uint64) ([]uint64, error) { | |||
rv := make([]uint64, 0, len(memSegment.DictKeys)) | |||
varintBuf := make([]byte, binary.MaxVarintLen64) | |||
var buffer bytes.Buffer | |||
for fieldID, fieldTerms := range memSegment.DictKeys { | |||
if fieldID != 0 { | |||
buffer.Reset() | |||
} | |||
// start a new vellum for this field | |||
builder, err := vellum.New(&buffer, nil) | |||
if err != nil { | |||
return nil, err | |||
} | |||
dict := memSegment.Dicts[fieldID] | |||
// now walk the dictionary in order of fieldTerms (already sorted) | |||
for _, fieldTerm := range fieldTerms { | |||
postingID := dict[fieldTerm] - 1 | |||
postingsAddr := postingsLocs[postingID] | |||
err = builder.Insert([]byte(fieldTerm), postingsAddr) | |||
if err != nil { | |||
return nil, err | |||
} | |||
} | |||
err = builder.Close() | |||
if err != nil { | |||
return nil, err | |||
} | |||
// record where this dictionary starts | |||
rv = append(rv, uint64(w.Count())) | |||
vellumData := buffer.Bytes() | |||
// write out the length of the vellum data | |||
n := binary.PutUvarint(varintBuf, uint64(len(vellumData))) | |||
_, err = w.Write(varintBuf[:n]) | |||
if err != nil { | |||
return nil, err | |||
} | |||
// write this vellum to disk | |||
_, err = w.Write(vellumData) | |||
if err != nil { | |||
return nil, err | |||
} | |||
} | |||
return rv, nil | |||
} | |||
type docIDRange []uint64 | |||
func (a docIDRange) Len() int { return len(a) } | |||
func (a docIDRange) Swap(i, j int) { a[i], a[j] = a[j], a[i] } | |||
func (a docIDRange) Less(i, j int) bool { return a[i] < a[j] } | |||
func persistDocValues(memSegment *mem.Segment, w *CountHashWriter, | |||
chunkFactor uint32) (map[uint16]uint64, error) { | |||
fieldChunkOffsets := make(map[uint16]uint64, len(memSegment.FieldsInv)) | |||
fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1)) | |||
for fieldID := range memSegment.DocValueFields { | |||
field := memSegment.FieldsInv[fieldID] | |||
docTermMap := make(map[uint64][]byte, 0) | |||
dict, err := memSegment.Dictionary(field) | |||
if err != nil { | |||
return nil, err | |||
} | |||
dictItr := dict.Iterator() | |||
next, err := dictItr.Next() | |||
for err == nil && next != nil { | |||
postings, err1 := dict.PostingsList(next.Term, nil) | |||
if err1 != nil { | |||
return nil, err | |||
} | |||
postingsItr := postings.Iterator() | |||
nextPosting, err2 := postingsItr.Next() | |||
for err2 == nil && nextPosting != nil { | |||
docNum := nextPosting.Number() | |||
docTermMap[docNum] = append(docTermMap[docNum], []byte(next.Term)...) | |||
docTermMap[docNum] = append(docTermMap[docNum], termSeparator) | |||
nextPosting, err2 = postingsItr.Next() | |||
} | |||
if err2 != nil { | |||
return nil, err2 | |||
} | |||
next, err = dictItr.Next() | |||
} | |||
if err != nil { | |||
return nil, err | |||
} | |||
// sort wrt to docIDs | |||
var docNumbers docIDRange | |||
for k := range docTermMap { | |||
docNumbers = append(docNumbers, k) | |||
} | |||
sort.Sort(docNumbers) | |||
for _, docNum := range docNumbers { | |||
err = fdvEncoder.Add(docNum, docTermMap[docNum]) | |||
if err != nil { | |||
return nil, err | |||
} | |||
} | |||
fieldChunkOffsets[fieldID] = uint64(w.Count()) | |||
err = fdvEncoder.Close() | |||
if err != nil { | |||
return nil, err | |||
} | |||
// persist the doc value details for this field | |||
_, err = fdvEncoder.Write(w) | |||
if err != nil { | |||
return nil, err | |||
} | |||
// reseting encoder for the next field | |||
fdvEncoder.Reset() | |||
} | |||
return fieldChunkOffsets, nil | |||
} | |||
func persistFieldDocValues(memSegment *mem.Segment, w *CountHashWriter, | |||
chunkFactor uint32) (uint64, error) { | |||
fieldDvOffsets, err := persistDocValues(memSegment, w, chunkFactor) | |||
if err != nil { | |||
return 0, err | |||
} | |||
fieldDocValuesOffset := uint64(w.Count()) | |||
buf := make([]byte, binary.MaxVarintLen64) | |||
offset := uint64(0) | |||
ok := true | |||
for fieldID := range memSegment.FieldsInv { | |||
// if the field isn't configured for docValue, then mark | |||
// the offset accordingly | |||
if offset, ok = fieldDvOffsets[uint16(fieldID)]; !ok { | |||
offset = fieldNotUninverted | |||
} | |||
n := binary.PutUvarint(buf, uint64(offset)) | |||
_, err := w.Write(buf[:n]) | |||
if err != nil { | |||
return 0, err | |||
} | |||
} | |||
return fieldDocValuesOffset, nil | |||
} | |||
func NewSegmentBase(memSegment *mem.Segment, chunkFactor uint32) (*SegmentBase, error) { | |||
var br bytes.Buffer | |||
cr := NewCountHashWriter(&br) | |||
numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs, err := | |||
persistBase(memSegment, cr, chunkFactor) | |||
if err != nil { | |||
return nil, err | |||
} | |||
return InitSegmentBase(br.Bytes(), cr.Sum32(), chunkFactor, | |||
memSegment.FieldsMap, memSegment.FieldsInv, numDocs, | |||
storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs) | |||
} | |||
func InitSegmentBase(mem []byte, memCRC uint32, chunkFactor uint32, | |||
fieldsMap map[string]uint16, fieldsInv []string, numDocs uint64, | |||
storedIndexOffset uint64, fieldsIndexOffset uint64, docValueOffset uint64, | |||
@@ -653,10 +136,11 @@ func InitSegmentBase(mem []byte, memCRC uint32, chunkFactor uint32, | |||
fieldsIndexOffset: fieldsIndexOffset, | |||
docValueOffset: docValueOffset, | |||
dictLocs: dictLocs, | |||
fieldDvIterMap: make(map[uint16]*docValueIterator), | |||
fieldDvReaders: make(map[uint16]*docValueReader), | |||
} | |||
sb.updateSize() | |||
err := sb.loadDvIterators() | |||
err := sb.loadDvReaders() | |||
if err != nil { | |||
return nil, err | |||
} |
@@ -18,41 +18,56 @@ import ( | |||
"bytes" | |||
"encoding/binary" | |||
"io" | |||
"reflect" | |||
"github.com/golang/snappy" | |||
) | |||
var reflectStaticSizeMetaData int | |||
func init() { | |||
var md MetaData | |||
reflectStaticSizeMetaData = int(reflect.TypeOf(md).Size()) | |||
} | |||
var termSeparator byte = 0xff | |||
var termSeparatorSplitSlice = []byte{termSeparator} | |||
type chunkedContentCoder struct { | |||
final []byte | |||
chunkSize uint64 | |||
currChunk uint64 | |||
chunkLens []uint64 | |||
final []byte | |||
chunkSize uint64 | |||
currChunk uint64 | |||
chunkLens []uint64 | |||
w io.Writer | |||
progressiveWrite bool | |||
chunkMetaBuf bytes.Buffer | |||
chunkBuf bytes.Buffer | |||
chunkMeta []MetaData | |||
compressed []byte // temp buf for snappy compression | |||
} | |||
// MetaData represents the data information inside a | |||
// chunk. | |||
type MetaData struct { | |||
DocNum uint64 // docNum of the data inside the chunk | |||
DocDvLoc uint64 // starting offset for a given docid | |||
DocDvLen uint64 // length of data inside the chunk for the given docid | |||
DocNum uint64 // docNum of the data inside the chunk | |||
DocDvOffset uint64 // offset of data inside the chunk for the given docid | |||
} | |||
// newChunkedContentCoder returns a new chunk content coder which | |||
// packs data into chunks based on the provided chunkSize | |||
func newChunkedContentCoder(chunkSize uint64, | |||
maxDocNum uint64) *chunkedContentCoder { | |||
func newChunkedContentCoder(chunkSize uint64, maxDocNum uint64, | |||
w io.Writer, progressiveWrite bool) *chunkedContentCoder { | |||
total := maxDocNum/chunkSize + 1 | |||
rv := &chunkedContentCoder{ | |||
chunkSize: chunkSize, | |||
chunkLens: make([]uint64, total), | |||
chunkMeta: make([]MetaData, 0, total), | |||
chunkSize: chunkSize, | |||
chunkLens: make([]uint64, total), | |||
chunkMeta: make([]MetaData, 0, total), | |||
w: w, | |||
progressiveWrite: progressiveWrite, | |||
} | |||
return rv | |||
@@ -88,7 +103,7 @@ func (c *chunkedContentCoder) flushContents() error { | |||
// write out the metaData slice | |||
for _, meta := range c.chunkMeta { | |||
_, err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvLoc, meta.DocDvLen) | |||
_, err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvOffset) | |||
if err != nil { | |||
return err | |||
} | |||
@@ -98,10 +113,19 @@ func (c *chunkedContentCoder) flushContents() error { | |||
metaData := c.chunkMetaBuf.Bytes() | |||
c.final = append(c.final, c.chunkMetaBuf.Bytes()...) | |||
// write the compressed data to the final data | |||
compressedData := snappy.Encode(nil, c.chunkBuf.Bytes()) | |||
c.final = append(c.final, compressedData...) | |||
c.compressed = snappy.Encode(c.compressed[:cap(c.compressed)], c.chunkBuf.Bytes()) | |||
c.final = append(c.final, c.compressed...) | |||
c.chunkLens[c.currChunk] = uint64(len(c.compressed) + len(metaData)) | |||
if c.progressiveWrite { | |||
_, err := c.w.Write(c.final) | |||
if err != nil { | |||
return err | |||
} | |||
c.final = c.final[:0] | |||
} | |||
c.chunkLens[c.currChunk] = uint64(len(compressedData) + len(metaData)) | |||
return nil | |||
} | |||
@@ -122,7 +146,7 @@ func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error { | |||
c.currChunk = chunk | |||
} | |||
// mark the starting offset for this doc | |||
// get the starting offset for this doc | |||
dvOffset := c.chunkBuf.Len() | |||
dvSize, err := c.chunkBuf.Write(vals) | |||
if err != nil { | |||
@@ -130,38 +154,77 @@ func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error { | |||
} | |||
c.chunkMeta = append(c.chunkMeta, MetaData{ | |||
DocNum: docNum, | |||
DocDvLoc: uint64(dvOffset), | |||
DocDvLen: uint64(dvSize), | |||
DocNum: docNum, | |||
DocDvOffset: uint64(dvOffset + dvSize), | |||
}) | |||
return nil | |||
} | |||
// Write commits all the encoded chunked contents to the provided writer. | |||
func (c *chunkedContentCoder) Write(w io.Writer) (int, error) { | |||
// | |||
// | ..... data ..... | chunk offsets (varints) | |||
// | position of chunk offsets (uint64) | number of offsets (uint64) | | |||
// | |||
func (c *chunkedContentCoder) Write() (int, error) { | |||
var tw int | |||
buf := make([]byte, binary.MaxVarintLen64) | |||
// write out the number of chunks | |||
n := binary.PutUvarint(buf, uint64(len(c.chunkLens))) | |||
nw, err := w.Write(buf[:n]) | |||
tw += nw | |||
if err != nil { | |||
return tw, err | |||
if c.final != nil { | |||
// write out the data section first | |||
nw, err := c.w.Write(c.final) | |||
tw += nw | |||
if err != nil { | |||
return tw, err | |||
} | |||
} | |||
chunkOffsetsStart := uint64(tw) | |||
if cap(c.final) < binary.MaxVarintLen64 { | |||
c.final = make([]byte, binary.MaxVarintLen64) | |||
} else { | |||
c.final = c.final[0:binary.MaxVarintLen64] | |||
} | |||
// write out the chunk lens | |||
for _, chunkLen := range c.chunkLens { | |||
n := binary.PutUvarint(buf, uint64(chunkLen)) | |||
nw, err = w.Write(buf[:n]) | |||
chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens) | |||
// write out the chunk offsets | |||
for _, chunkOffset := range chunkOffsets { | |||
n := binary.PutUvarint(c.final, chunkOffset) | |||
nw, err := c.w.Write(c.final[:n]) | |||
tw += nw | |||
if err != nil { | |||
return tw, err | |||
} | |||
} | |||
// write out the data | |||
nw, err = w.Write(c.final) | |||
chunkOffsetsLen := uint64(tw) - chunkOffsetsStart | |||
c.final = c.final[0:8] | |||
// write out the length of chunk offsets | |||
binary.BigEndian.PutUint64(c.final, chunkOffsetsLen) | |||
nw, err := c.w.Write(c.final) | |||
tw += nw | |||
if err != nil { | |||
return tw, err | |||
} | |||
// write out the number of chunks | |||
binary.BigEndian.PutUint64(c.final, uint64(len(c.chunkLens))) | |||
nw, err = c.w.Write(c.final) | |||
tw += nw | |||
if err != nil { | |||
return tw, err | |||
} | |||
c.final = c.final[:0] | |||
return tw, nil | |||
} | |||
// ReadDocValueBoundary elicits the start, end offsets from a | |||
// metaData header slice | |||
func ReadDocValueBoundary(chunk int, metaHeaders []MetaData) (uint64, uint64) { | |||
var start uint64 | |||
if chunk > 0 { | |||
start = metaHeaders[chunk-1].DocDvOffset | |||
} | |||
return start, metaHeaders[chunk].DocDvOffset | |||
} |
@@ -17,6 +17,8 @@ package zap | |||
import ( | |||
"hash/crc32" | |||
"io" | |||
"github.com/blevesearch/bleve/index/scorch/segment" | |||
) | |||
// CountHashWriter is a wrapper around a Writer which counts the number of | |||
@@ -25,6 +27,7 @@ type CountHashWriter struct { | |||
w io.Writer | |||
crc uint32 | |||
n int | |||
s segment.StatsReporter | |||
} | |||
// NewCountHashWriter returns a CountHashWriter which wraps the provided Writer | |||
@@ -32,11 +35,18 @@ func NewCountHashWriter(w io.Writer) *CountHashWriter { | |||
return &CountHashWriter{w: w} | |||
} | |||
func NewCountHashWriterWithStatsReporter(w io.Writer, s segment.StatsReporter) *CountHashWriter { | |||
return &CountHashWriter{w: w, s: s} | |||
} | |||
// Write writes the provided bytes to the wrapped writer and counts the bytes | |||
func (c *CountHashWriter) Write(b []byte) (int, error) { | |||
n, err := c.w.Write(b) | |||
c.crc = crc32.Update(c.crc, crc32.IEEETable, b[:n]) | |||
c.n += n | |||
if c.s != nil { | |||
c.s.ReportBytesWritten(uint64(n)) | |||
} | |||
return n, err | |||
} | |||
@@ -15,38 +15,51 @@ | |||
package zap | |||
import ( | |||
"bytes" | |||
"fmt" | |||
"github.com/RoaringBitmap/roaring" | |||
"github.com/blevesearch/bleve/index" | |||
"github.com/blevesearch/bleve/index/scorch/segment" | |||
"github.com/couchbase/vellum" | |||
"github.com/couchbase/vellum/regexp" | |||
) | |||
// Dictionary is the zap representation of the term dictionary | |||
type Dictionary struct { | |||
sb *SegmentBase | |||
field string | |||
fieldID uint16 | |||
fst *vellum.FST | |||
sb *SegmentBase | |||
field string | |||
fieldID uint16 | |||
fst *vellum.FST | |||
fstReader *vellum.Reader | |||
} | |||
// PostingsList returns the postings list for the specified term | |||
func (d *Dictionary) PostingsList(term string, except *roaring.Bitmap) (segment.PostingsList, error) { | |||
return d.postingsList([]byte(term), except, nil) | |||
func (d *Dictionary) PostingsList(term []byte, except *roaring.Bitmap, | |||
prealloc segment.PostingsList) (segment.PostingsList, error) { | |||
var preallocPL *PostingsList | |||
pl, ok := prealloc.(*PostingsList) | |||
if ok && pl != nil { | |||
preallocPL = pl | |||
} | |||
return d.postingsList(term, except, preallocPL) | |||
} | |||
func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) { | |||
if d.fst == nil { | |||
if d.fstReader == nil { | |||
if rv == nil || rv == emptyPostingsList { | |||
return emptyPostingsList, nil | |||
} | |||
return d.postingsListInit(rv, except), nil | |||
} | |||
postingsOffset, exists, err := d.fst.Get(term) | |||
postingsOffset, exists, err := d.fstReader.Get(term) | |||
if err != nil { | |||
return nil, fmt.Errorf("vellum err: %v", err) | |||
} | |||
if !exists { | |||
if rv == nil || rv == emptyPostingsList { | |||
return emptyPostingsList, nil | |||
} | |||
return d.postingsListInit(rv, except), nil | |||
} | |||
@@ -65,10 +78,17 @@ func (d *Dictionary) postingsListFromOffset(postingsOffset uint64, except *roari | |||
} | |||
func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap) *PostingsList { | |||
if rv == nil { | |||
if rv == nil || rv == emptyPostingsList { | |||
rv = &PostingsList{} | |||
} else { | |||
postings := rv.postings | |||
if postings != nil { | |||
postings.Clear() | |||
} | |||
*rv = PostingsList{} // clear the struct | |||
rv.postings = postings | |||
} | |||
rv.sb = d.sb | |||
rv.except = except | |||
@@ -85,6 +105,8 @@ func (d *Dictionary) Iterator() segment.DictionaryIterator { | |||
itr, err := d.fst.Iterator(nil, nil) | |||
if err == nil { | |||
rv.itr = itr | |||
} else if err != vellum.ErrIteratorDone { | |||
rv.err = err | |||
} | |||
} | |||
@@ -98,13 +120,15 @@ func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator { | |||
d: d, | |||
} | |||
kBeg := []byte(prefix) | |||
kEnd := segment.IncrementBytes(kBeg) | |||
if d.fst != nil { | |||
r, err := regexp.New(prefix + ".*") | |||
itr, err := d.fst.Iterator(kBeg, kEnd) | |||
if err == nil { | |||
itr, err := d.fst.Search(r, nil, nil) | |||
if err == nil { | |||
rv.itr = itr | |||
} | |||
rv.itr = itr | |||
} else if err != vellum.ErrIteratorDone { | |||
rv.err = err | |||
} | |||
} | |||
@@ -130,36 +154,103 @@ func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator | |||
itr, err := d.fst.Iterator([]byte(start), endBytes) | |||
if err == nil { | |||
rv.itr = itr | |||
} else if err != vellum.ErrIteratorDone { | |||
rv.err = err | |||
} | |||
} | |||
return rv | |||
} | |||
// AutomatonIterator returns an iterator which only visits terms | |||
// having the the vellum automaton and start/end key range | |||
func (d *Dictionary) AutomatonIterator(a vellum.Automaton, | |||
startKeyInclusive, endKeyExclusive []byte) segment.DictionaryIterator { | |||
rv := &DictionaryIterator{ | |||
d: d, | |||
} | |||
if d.fst != nil { | |||
itr, err := d.fst.Search(a, startKeyInclusive, endKeyExclusive) | |||
if err == nil { | |||
rv.itr = itr | |||
} else if err != vellum.ErrIteratorDone { | |||
rv.err = err | |||
} | |||
} | |||
return rv | |||
} | |||
func (d *Dictionary) OnlyIterator(onlyTerms [][]byte, | |||
includeCount bool) segment.DictionaryIterator { | |||
rv := &DictionaryIterator{ | |||
d: d, | |||
omitCount: !includeCount, | |||
} | |||
var buf bytes.Buffer | |||
builder, err := vellum.New(&buf, nil) | |||
if err != nil { | |||
rv.err = err | |||
return rv | |||
} | |||
for _, term := range onlyTerms { | |||
err = builder.Insert(term, 0) | |||
if err != nil { | |||
rv.err = err | |||
return rv | |||
} | |||
} | |||
err = builder.Close() | |||
if err != nil { | |||
rv.err = err | |||
return rv | |||
} | |||
onlyFST, err := vellum.Load(buf.Bytes()) | |||
if err != nil { | |||
rv.err = err | |||
return rv | |||
} | |||
itr, err := d.fst.Search(onlyFST, nil, nil) | |||
if err == nil { | |||
rv.itr = itr | |||
} else if err != vellum.ErrIteratorDone { | |||
rv.err = err | |||
} | |||
return rv | |||
} | |||
// DictionaryIterator is an iterator for term dictionary | |||
type DictionaryIterator struct { | |||
d *Dictionary | |||
itr vellum.Iterator | |||
err error | |||
tmp PostingsList | |||
d *Dictionary | |||
itr vellum.Iterator | |||
err error | |||
tmp PostingsList | |||
entry index.DictEntry | |||
omitCount bool | |||
} | |||
// Next returns the next entry in the dictionary | |||
func (i *DictionaryIterator) Next() (*index.DictEntry, error) { | |||
if i.itr == nil || i.err == vellum.ErrIteratorDone { | |||
return nil, nil | |||
} else if i.err != nil { | |||
if i.err != nil && i.err != vellum.ErrIteratorDone { | |||
return nil, i.err | |||
} else if i.itr == nil || i.err == vellum.ErrIteratorDone { | |||
return nil, nil | |||
} | |||
term, postingsOffset := i.itr.Current() | |||
i.err = i.tmp.read(postingsOffset, i.d) | |||
if i.err != nil { | |||
return nil, i.err | |||
} | |||
rv := &index.DictEntry{ | |||
Term: string(term), | |||
Count: i.tmp.Count(), | |||
i.entry.Term = string(term) | |||
if !i.omitCount { | |||
i.err = i.tmp.read(postingsOffset, i.d) | |||
if i.err != nil { | |||
return nil, i.err | |||
} | |||
i.entry.Count = i.tmp.Count() | |||
} | |||
i.err = i.itr.Next() | |||
return rv, nil | |||
return &i.entry, nil | |||
} |
@@ -19,93 +19,129 @@ import ( | |||
"encoding/binary" | |||
"fmt" | |||
"math" | |||
"reflect" | |||
"sort" | |||
"github.com/blevesearch/bleve/index" | |||
"github.com/blevesearch/bleve/index/scorch/segment" | |||
"github.com/blevesearch/bleve/size" | |||
"github.com/golang/snappy" | |||
) | |||
type docValueIterator struct { | |||
var reflectStaticSizedocValueReader int | |||
func init() { | |||
var dvi docValueReader | |||
reflectStaticSizedocValueReader = int(reflect.TypeOf(dvi).Size()) | |||
} | |||
type docNumTermsVisitor func(docNum uint64, terms []byte) error | |||
type docVisitState struct { | |||
dvrs map[uint16]*docValueReader | |||
segment *Segment | |||
} | |||
type docValueReader struct { | |||
field string | |||
curChunkNum uint64 | |||
numChunks uint64 | |||
chunkLens []uint64 | |||
chunkOffsets []uint64 | |||
dvDataLoc uint64 | |||
curChunkHeader []MetaData | |||
curChunkData []byte // compressed data cache | |||
uncompressed []byte // temp buf for snappy decompression | |||
} | |||
func (di *docValueIterator) sizeInBytes() uint64 { | |||
// curChunkNum, numChunks, dvDataLoc --> uint64 | |||
sizeInBytes := 24 | |||
// field | |||
sizeInBytes += (len(di.field) + int(segment.SizeOfString)) | |||
func (di *docValueReader) size() int { | |||
return reflectStaticSizedocValueReader + size.SizeOfPtr + | |||
len(di.field) + | |||
len(di.chunkOffsets)*size.SizeOfUint64 + | |||
len(di.curChunkHeader)*reflectStaticSizeMetaData + | |||
len(di.curChunkData) | |||
} | |||
// chunkLens, curChunkHeader | |||
sizeInBytes += len(di.chunkLens)*8 + | |||
len(di.curChunkHeader)*24 + | |||
int(segment.SizeOfSlice*2) /* overhead from slices */ | |||
func (di *docValueReader) cloneInto(rv *docValueReader) *docValueReader { | |||
if rv == nil { | |||
rv = &docValueReader{} | |||
} | |||
// curChunkData is mmap'ed, not included | |||
rv.field = di.field | |||
rv.curChunkNum = math.MaxUint64 | |||
rv.chunkOffsets = di.chunkOffsets // immutable, so it's sharable | |||
rv.dvDataLoc = di.dvDataLoc | |||
rv.curChunkHeader = rv.curChunkHeader[:0] | |||
rv.curChunkData = nil | |||
rv.uncompressed = rv.uncompressed[:0] | |||
return uint64(sizeInBytes) | |||
return rv | |||
} | |||
func (di *docValueIterator) fieldName() string { | |||
func (di *docValueReader) fieldName() string { | |||
return di.field | |||
} | |||
func (di *docValueIterator) curChunkNumber() uint64 { | |||
func (di *docValueReader) curChunkNumber() uint64 { | |||
return di.curChunkNum | |||
} | |||
func (s *SegmentBase) loadFieldDocValueIterator(field string, | |||
fieldDvLoc uint64) (*docValueIterator, error) { | |||
func (s *SegmentBase) loadFieldDocValueReader(field string, | |||
fieldDvLocStart, fieldDvLocEnd uint64) (*docValueReader, error) { | |||
// get the docValue offset for the given fields | |||
if fieldDvLoc == fieldNotUninverted { | |||
return nil, fmt.Errorf("loadFieldDocValueIterator: "+ | |||
if fieldDvLocStart == fieldNotUninverted { | |||
return nil, fmt.Errorf("loadFieldDocValueReader: "+ | |||
"no docValues found for field: %s", field) | |||
} | |||
// read the number of chunks, chunk lengths | |||
var offset, clen uint64 | |||
numChunks, read := binary.Uvarint(s.mem[fieldDvLoc : fieldDvLoc+binary.MaxVarintLen64]) | |||
if read <= 0 { | |||
return nil, fmt.Errorf("failed to read the field "+ | |||
"doc values for field %s", field) | |||
// read the number of chunks, and chunk offsets position | |||
var numChunks, chunkOffsetsPosition uint64 | |||
if fieldDvLocEnd-fieldDvLocStart > 16 { | |||
numChunks = binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-8 : fieldDvLocEnd]) | |||
// read the length of chunk offsets | |||
chunkOffsetsLen := binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-16 : fieldDvLocEnd-8]) | |||
// acquire position of chunk offsets | |||
chunkOffsetsPosition = (fieldDvLocEnd - 16) - chunkOffsetsLen | |||
} | |||
offset += uint64(read) | |||
fdvIter := &docValueIterator{ | |||
curChunkNum: math.MaxUint64, | |||
field: field, | |||
chunkLens: make([]uint64, int(numChunks)), | |||
fdvIter := &docValueReader{ | |||
curChunkNum: math.MaxUint64, | |||
field: field, | |||
chunkOffsets: make([]uint64, int(numChunks)), | |||
} | |||
// read the chunk offsets | |||
var offset uint64 | |||
for i := 0; i < int(numChunks); i++ { | |||
clen, read = binary.Uvarint(s.mem[fieldDvLoc+offset : fieldDvLoc+offset+binary.MaxVarintLen64]) | |||
loc, read := binary.Uvarint(s.mem[chunkOffsetsPosition+offset : chunkOffsetsPosition+offset+binary.MaxVarintLen64]) | |||
if read <= 0 { | |||
return nil, fmt.Errorf("corrupted chunk length during segment load") | |||
return nil, fmt.Errorf("corrupted chunk offset during segment load") | |||
} | |||
fdvIter.chunkLens[i] = clen | |||
fdvIter.chunkOffsets[i] = loc | |||
offset += uint64(read) | |||
} | |||
fdvIter.dvDataLoc = fieldDvLoc + offset | |||
// set the data offset | |||
fdvIter.dvDataLoc = fieldDvLocStart | |||
return fdvIter, nil | |||
} | |||
func (di *docValueIterator) loadDvChunk(chunkNumber, | |||
localDocNum uint64, s *SegmentBase) error { | |||
func (di *docValueReader) loadDvChunk(chunkNumber uint64, s *SegmentBase) error { | |||
// advance to the chunk where the docValues | |||
// reside for the given docNum | |||
destChunkDataLoc := di.dvDataLoc | |||
for i := 0; i < int(chunkNumber); i++ { | |||
destChunkDataLoc += di.chunkLens[i] | |||
destChunkDataLoc, curChunkEnd := di.dvDataLoc, di.dvDataLoc | |||
start, end := readChunkBoundary(int(chunkNumber), di.chunkOffsets) | |||
if start >= end { | |||
di.curChunkHeader = di.curChunkHeader[:0] | |||
di.curChunkData = nil | |||
di.curChunkNum = chunkNumber | |||
di.uncompressed = di.uncompressed[:0] | |||
return nil | |||
} | |||
curChunkSize := di.chunkLens[chunkNumber] | |||
destChunkDataLoc += start | |||
curChunkEnd += end | |||
// read the number of docs reside in the chunk | |||
numDocs, read := binary.Uvarint(s.mem[destChunkDataLoc : destChunkDataLoc+binary.MaxVarintLen64]) | |||
if read <= 0 { | |||
@@ -114,38 +150,81 @@ func (di *docValueIterator) loadDvChunk(chunkNumber, | |||
chunkMetaLoc := destChunkDataLoc + uint64(read) | |||
offset := uint64(0) | |||
di.curChunkHeader = make([]MetaData, int(numDocs)) | |||
if cap(di.curChunkHeader) < int(numDocs) { | |||
di.curChunkHeader = make([]MetaData, int(numDocs)) | |||
} else { | |||
di.curChunkHeader = di.curChunkHeader[:int(numDocs)] | |||
} | |||
for i := 0; i < int(numDocs); i++ { | |||
di.curChunkHeader[i].DocNum, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) | |||
offset += uint64(read) | |||
di.curChunkHeader[i].DocDvLoc, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) | |||
offset += uint64(read) | |||
di.curChunkHeader[i].DocDvLen, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) | |||
di.curChunkHeader[i].DocDvOffset, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) | |||
offset += uint64(read) | |||
} | |||
compressedDataLoc := chunkMetaLoc + offset | |||
dataLength := destChunkDataLoc + curChunkSize - compressedDataLoc | |||
dataLength := curChunkEnd - compressedDataLoc | |||
di.curChunkData = s.mem[compressedDataLoc : compressedDataLoc+dataLength] | |||
di.curChunkNum = chunkNumber | |||
di.uncompressed = di.uncompressed[:0] | |||
return nil | |||
} | |||
func (di *docValueReader) iterateAllDocValues(s *SegmentBase, visitor docNumTermsVisitor) error { | |||
for i := 0; i < len(di.chunkOffsets); i++ { | |||
err := di.loadDvChunk(uint64(i), s) | |||
if err != nil { | |||
return err | |||
} | |||
if di.curChunkData == nil || len(di.curChunkHeader) == 0 { | |||
continue | |||
} | |||
// uncompress the already loaded data | |||
uncompressed, err := snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData) | |||
if err != nil { | |||
return err | |||
} | |||
di.uncompressed = uncompressed | |||
start := uint64(0) | |||
for _, entry := range di.curChunkHeader { | |||
err = visitor(entry.DocNum, uncompressed[start:entry.DocDvOffset]) | |||
if err != nil { | |||
return err | |||
} | |||
start = entry.DocDvOffset | |||
} | |||
} | |||
return nil | |||
} | |||
func (di *docValueIterator) visitDocValues(docNum uint64, | |||
func (di *docValueReader) visitDocValues(docNum uint64, | |||
visitor index.DocumentFieldTermVisitor) error { | |||
// binary search the term locations for the docNum | |||
start, length := di.getDocValueLocs(docNum) | |||
if start == math.MaxUint64 || length == math.MaxUint64 { | |||
start, end := di.getDocValueLocs(docNum) | |||
if start == math.MaxUint64 || end == math.MaxUint64 || start == end { | |||
return nil | |||
} | |||
// uncompress the already loaded data | |||
uncompressed, err := snappy.Decode(nil, di.curChunkData) | |||
if err != nil { | |||
return err | |||
var uncompressed []byte | |||
var err error | |||
// use the uncompressed copy if available | |||
if len(di.uncompressed) > 0 { | |||
uncompressed = di.uncompressed | |||
} else { | |||
// uncompress the already loaded data | |||
uncompressed, err = snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData) | |||
if err != nil { | |||
return err | |||
} | |||
di.uncompressed = uncompressed | |||
} | |||
// pick the terms for the given docNum | |||
uncompressed = uncompressed[start : start+length] | |||
uncompressed = uncompressed[start:end] | |||
for { | |||
i := bytes.Index(uncompressed, termSeparatorSplitSlice) | |||
if i < 0 { | |||
@@ -159,55 +238,72 @@ func (di *docValueIterator) visitDocValues(docNum uint64, | |||
return nil | |||
} | |||
func (di *docValueIterator) getDocValueLocs(docNum uint64) (uint64, uint64) { | |||
func (di *docValueReader) getDocValueLocs(docNum uint64) (uint64, uint64) { | |||
i := sort.Search(len(di.curChunkHeader), func(i int) bool { | |||
return di.curChunkHeader[i].DocNum >= docNum | |||
}) | |||
if i < len(di.curChunkHeader) && di.curChunkHeader[i].DocNum == docNum { | |||
return di.curChunkHeader[i].DocDvLoc, di.curChunkHeader[i].DocDvLen | |||
return ReadDocValueBoundary(i, di.curChunkHeader) | |||
} | |||
return math.MaxUint64, math.MaxUint64 | |||
} | |||
// VisitDocumentFieldTerms is an implementation of the | |||
// DocumentFieldTermVisitable interface | |||
func (s *SegmentBase) VisitDocumentFieldTerms(localDocNum uint64, fields []string, | |||
visitor index.DocumentFieldTermVisitor) error { | |||
fieldIDPlus1 := uint16(0) | |||
ok := true | |||
func (s *Segment) VisitDocumentFieldTerms(localDocNum uint64, fields []string, | |||
visitor index.DocumentFieldTermVisitor, dvsIn segment.DocVisitState) ( | |||
segment.DocVisitState, error) { | |||
dvs, ok := dvsIn.(*docVisitState) | |||
if !ok || dvs == nil { | |||
dvs = &docVisitState{} | |||
} else { | |||
if dvs.segment != s { | |||
dvs.segment = s | |||
dvs.dvrs = nil | |||
} | |||
} | |||
var fieldIDPlus1 uint16 | |||
if dvs.dvrs == nil { | |||
dvs.dvrs = make(map[uint16]*docValueReader, len(fields)) | |||
for _, field := range fields { | |||
if fieldIDPlus1, ok = s.fieldsMap[field]; !ok { | |||
continue | |||
} | |||
fieldID := fieldIDPlus1 - 1 | |||
if dvIter, exists := s.fieldDvReaders[fieldID]; exists && | |||
dvIter != nil { | |||
dvs.dvrs[fieldID] = dvIter.cloneInto(dvs.dvrs[fieldID]) | |||
} | |||
} | |||
} | |||
// find the chunkNumber where the docValues are stored | |||
docInChunk := localDocNum / uint64(s.chunkFactor) | |||
var dvr *docValueReader | |||
for _, field := range fields { | |||
if fieldIDPlus1, ok = s.fieldsMap[field]; !ok { | |||
continue | |||
} | |||
// find the chunkNumber where the docValues are stored | |||
docInChunk := localDocNum / uint64(s.chunkFactor) | |||
if dvIter, exists := s.fieldDvIterMap[fieldIDPlus1-1]; exists && | |||
dvIter != nil { | |||
fieldID := fieldIDPlus1 - 1 | |||
if dvr, ok = dvs.dvrs[fieldID]; ok && dvr != nil { | |||
// check if the chunk is already loaded | |||
if docInChunk != dvIter.curChunkNumber() { | |||
err := dvIter.loadDvChunk(docInChunk, localDocNum, s) | |||
if docInChunk != dvr.curChunkNumber() { | |||
err := dvr.loadDvChunk(docInChunk, &s.SegmentBase) | |||
if err != nil { | |||
continue | |||
return dvs, err | |||
} | |||
} | |||
_ = dvIter.visitDocValues(localDocNum, visitor) | |||
_ = dvr.visitDocValues(localDocNum, visitor) | |||
} | |||
} | |||
return nil | |||
return dvs, nil | |||
} | |||
// VisitableDocValueFields returns the list of fields with | |||
// persisted doc value terms ready to be visitable using the | |||
// VisitDocumentFieldTerms method. | |||
func (s *Segment) VisitableDocValueFields() ([]string, error) { | |||
var rv []string | |||
for fieldID, field := range s.fieldsInv { | |||
if dvIter, ok := s.fieldDvIterMap[uint16(fieldID)]; ok && | |||
dvIter != nil { | |||
rv = append(rv, field) | |||
} | |||
} | |||
return rv, nil | |||
return s.fieldDvNames, nil | |||
} |
@@ -46,26 +46,27 @@ func newEnumerator(itrs []vellum.Iterator) (*enumerator, error) { | |||
for i, itr := range rv.itrs { | |||
rv.currKs[i], rv.currVs[i] = itr.Current() | |||
} | |||
rv.updateMatches() | |||
if rv.lowK == nil { | |||
rv.updateMatches(false) | |||
if rv.lowK == nil && len(rv.lowIdxs) == 0 { | |||
return rv, vellum.ErrIteratorDone | |||
} | |||
return rv, nil | |||
} | |||
// updateMatches maintains the low key matches based on the currKs | |||
func (m *enumerator) updateMatches() { | |||
func (m *enumerator) updateMatches(skipEmptyKey bool) { | |||
m.lowK = nil | |||
m.lowIdxs = m.lowIdxs[:0] | |||
m.lowCurr = 0 | |||
for i, key := range m.currKs { | |||
if key == nil { | |||
if (key == nil && m.currVs[i] == 0) || // in case of empty iterator | |||
(len(key) == 0 && skipEmptyKey) { // skip empty keys | |||
continue | |||
} | |||
cmp := bytes.Compare(key, m.lowK) | |||
if cmp < 0 || m.lowK == nil { | |||
if cmp < 0 || len(m.lowIdxs) == 0 { | |||
// reached a new low | |||
m.lowK = key | |||
m.lowIdxs = m.lowIdxs[:0] | |||
@@ -102,9 +103,10 @@ func (m *enumerator) Next() error { | |||
} | |||
m.currKs[vi], m.currVs[vi] = m.itrs[vi].Current() | |||
} | |||
m.updateMatches() | |||
// can skip any empty keys encountered at this point | |||
m.updateMatches(true) | |||
} | |||
if m.lowK == nil { | |||
if m.lowK == nil && len(m.lowIdxs) == 0 { | |||
return vellum.ErrIteratorDone | |||
} | |||
return nil |
@@ -18,16 +18,12 @@ import ( | |||
"bytes" | |||
"encoding/binary" | |||
"io" | |||
"github.com/Smerity/govarint" | |||
) | |||
type chunkedIntCoder struct { | |||
final []byte | |||
maxDocNum uint64 | |||
chunkSize uint64 | |||
chunkBuf bytes.Buffer | |||
encoder *govarint.Base128Encoder | |||
chunkLens []uint64 | |||
currChunk uint64 | |||
@@ -41,11 +37,9 @@ func newChunkedIntCoder(chunkSize uint64, maxDocNum uint64) *chunkedIntCoder { | |||
total := maxDocNum/chunkSize + 1 | |||
rv := &chunkedIntCoder{ | |||
chunkSize: chunkSize, | |||
maxDocNum: maxDocNum, | |||
chunkLens: make([]uint64, total), | |||
final: make([]byte, 0, 64), | |||
} | |||
rv.encoder = govarint.NewU64Base128Encoder(&rv.chunkBuf) | |||
return rv | |||
} | |||
@@ -67,16 +61,18 @@ func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error { | |||
chunk := docNum / c.chunkSize | |||
if chunk != c.currChunk { | |||
// starting a new chunk | |||
if c.encoder != nil { | |||
// close out last | |||
c.Close() | |||
c.chunkBuf.Reset() | |||
} | |||
c.Close() | |||
c.chunkBuf.Reset() | |||
c.currChunk = chunk | |||
} | |||
if len(c.buf) < binary.MaxVarintLen64 { | |||
c.buf = make([]byte, binary.MaxVarintLen64) | |||
} | |||
for _, val := range vals { | |||
_, err := c.encoder.PutU64(val) | |||
wb := binary.PutUvarint(c.buf, val) | |||
_, err := c.chunkBuf.Write(c.buf[:wb]) | |||
if err != nil { | |||
return err | |||
} | |||
@@ -85,13 +81,26 @@ func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error { | |||
return nil | |||
} | |||
func (c *chunkedIntCoder) AddBytes(docNum uint64, buf []byte) error { | |||
chunk := docNum / c.chunkSize | |||
if chunk != c.currChunk { | |||
// starting a new chunk | |||
c.Close() | |||
c.chunkBuf.Reset() | |||
c.currChunk = chunk | |||
} | |||
_, err := c.chunkBuf.Write(buf) | |||
return err | |||
} | |||
// Close indicates you are done calling Add() this allows the final chunk | |||
// to be encoded. | |||
func (c *chunkedIntCoder) Close() { | |||
c.encoder.Close() | |||
encodingBytes := c.chunkBuf.Bytes() | |||
c.chunkLens[c.currChunk] = uint64(len(encodingBytes)) | |||
c.final = append(c.final, encodingBytes...) | |||
c.currChunk = uint64(cap(c.chunkLens)) // sentinel to detect double close | |||
} | |||
// Write commits all the encoded chunked integers to the provided writer. | |||
@@ -102,10 +111,13 @@ func (c *chunkedIntCoder) Write(w io.Writer) (int, error) { | |||
} | |||
buf := c.buf | |||
// write out the number of chunks & each chunkLen | |||
n := binary.PutUvarint(buf, uint64(len(c.chunkLens))) | |||
for _, chunkLen := range c.chunkLens { | |||
n += binary.PutUvarint(buf[n:], uint64(chunkLen)) | |||
// convert the chunk lengths into chunk offsets | |||
chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens) | |||
// write out the number of chunks & each chunk offsets | |||
n := binary.PutUvarint(buf, uint64(len(chunkOffsets))) | |||
for _, chunkOffset := range chunkOffsets { | |||
n += binary.PutUvarint(buf[n:], chunkOffset) | |||
} | |||
tw, err := w.Write(buf[:n]) | |||
@@ -121,3 +133,40 @@ func (c *chunkedIntCoder) Write(w io.Writer) (int, error) { | |||
} | |||
return tw, nil | |||
} | |||
func (c *chunkedIntCoder) FinalSize() int { | |||
return len(c.final) | |||
} | |||
// modifyLengthsToEndOffsets converts the chunk length array | |||
// to a chunk offset array. The readChunkBoundary | |||
// will figure out the start and end of every chunk from | |||
// these offsets. Starting offset of i'th index is stored | |||
// in i-1'th position except for 0'th index and ending offset | |||
// is stored at i'th index position. | |||
// For 0'th element, starting position is always zero. | |||
// eg: | |||
// Lens -> 5 5 5 5 => 5 10 15 20 | |||
// Lens -> 0 5 0 5 => 0 5 5 10 | |||
// Lens -> 0 0 0 5 => 0 0 0 5 | |||
// Lens -> 5 0 0 0 => 5 5 5 5 | |||
// Lens -> 0 5 0 0 => 0 5 5 5 | |||
// Lens -> 0 0 5 0 => 0 0 5 5 | |||
func modifyLengthsToEndOffsets(lengths []uint64) []uint64 { | |||
var runningOffset uint64 | |||
var index, i int | |||
for i = 1; i <= len(lengths); i++ { | |||
runningOffset += lengths[i-1] | |||
lengths[index] = runningOffset | |||
index++ | |||
} | |||
return lengths | |||
} | |||
func readChunkBoundary(chunk int, offsets []uint64) (uint64, uint64) { | |||
var start uint64 | |||
if chunk > 0 { | |||
start = offsets[chunk-1] | |||
} | |||
return start, offsets[chunk] | |||
} |
@@ -24,11 +24,13 @@ import ( | |||
"sort" | |||
"github.com/RoaringBitmap/roaring" | |||
"github.com/Smerity/govarint" | |||
seg "github.com/blevesearch/bleve/index/scorch/segment" | |||
"github.com/couchbase/vellum" | |||
"github.com/golang/snappy" | |||
) | |||
var DefaultFileMergerBufferSize = 1024 * 1024 | |||
const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc | |||
// Merge takes a slice of zap segments and bit masks describing which | |||
@@ -36,12 +38,24 @@ const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc | |||
// remaining data. This new segment is built at the specified path, | |||
// with the provided chunkFactor. | |||
func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, | |||
chunkFactor uint32) ([][]uint64, error) { | |||
chunkFactor uint32, closeCh chan struct{}, s seg.StatsReporter) ( | |||
[][]uint64, uint64, error) { | |||
segmentBases := make([]*SegmentBase, len(segments)) | |||
for segmenti, segment := range segments { | |||
segmentBases[segmenti] = &segment.SegmentBase | |||
} | |||
return MergeSegmentBases(segmentBases, drops, path, chunkFactor, closeCh, s) | |||
} | |||
func MergeSegmentBases(segmentBases []*SegmentBase, drops []*roaring.Bitmap, path string, | |||
chunkFactor uint32, closeCh chan struct{}, s seg.StatsReporter) ( | |||
[][]uint64, uint64, error) { | |||
flag := os.O_RDWR | os.O_CREATE | |||
f, err := os.OpenFile(path, flag, 0600) | |||
if err != nil { | |||
return nil, err | |||
return nil, 0, err | |||
} | |||
cleanup := func() { | |||
@@ -49,54 +63,49 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, | |||
_ = os.Remove(path) | |||
} | |||
segmentBases := make([]*SegmentBase, len(segments)) | |||
for segmenti, segment := range segments { | |||
segmentBases[segmenti] = &segment.SegmentBase | |||
} | |||
// buffer the output | |||
br := bufio.NewWriter(f) | |||
br := bufio.NewWriterSize(f, DefaultFileMergerBufferSize) | |||
// wrap it for counting (tracking offsets) | |||
cr := NewCountHashWriter(br) | |||
cr := NewCountHashWriterWithStatsReporter(br, s) | |||
newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, _, _, _, err := | |||
MergeToWriter(segmentBases, drops, chunkFactor, cr) | |||
MergeToWriter(segmentBases, drops, chunkFactor, cr, closeCh) | |||
if err != nil { | |||
cleanup() | |||
return nil, err | |||
return nil, 0, err | |||
} | |||
err = persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, | |||
docValueOffset, chunkFactor, cr.Sum32(), cr) | |||
if err != nil { | |||
cleanup() | |||
return nil, err | |||
return nil, 0, err | |||
} | |||
err = br.Flush() | |||
if err != nil { | |||
cleanup() | |||
return nil, err | |||
return nil, 0, err | |||
} | |||
err = f.Sync() | |||
if err != nil { | |||
cleanup() | |||
return nil, err | |||
return nil, 0, err | |||
} | |||
err = f.Close() | |||
if err != nil { | |||
cleanup() | |||
return nil, err | |||
return nil, 0, err | |||
} | |||
return newDocNums, nil | |||
return newDocNums, uint64(cr.Count()), nil | |||
} | |||
func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap, | |||
chunkFactor uint32, cr *CountHashWriter) ( | |||
chunkFactor uint32, cr *CountHashWriter, closeCh chan struct{}) ( | |||
newDocNums [][]uint64, | |||
numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64, | |||
dictLocs []uint64, fieldsInv []string, fieldsMap map[string]uint16, | |||
@@ -108,15 +117,21 @@ func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap, | |||
fieldsMap = mapFields(fieldsInv) | |||
numDocs = computeNewDocCount(segments, drops) | |||
if isClosed(closeCh) { | |||
return nil, 0, 0, 0, 0, nil, nil, nil, seg.ErrClosed | |||
} | |||
if numDocs > 0 { | |||
storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops, | |||
fieldsMap, fieldsInv, fieldsSame, numDocs, cr) | |||
fieldsMap, fieldsInv, fieldsSame, numDocs, cr, closeCh) | |||
if err != nil { | |||
return nil, 0, 0, 0, 0, nil, nil, nil, err | |||
} | |||
dictLocs, docValueOffset, err = persistMergedRest(segments, drops, fieldsInv, fieldsMap, | |||
newDocNums, numDocs, chunkFactor, cr) | |||
dictLocs, docValueOffset, err = persistMergedRest(segments, drops, | |||
fieldsInv, fieldsMap, fieldsSame, | |||
newDocNums, numDocs, chunkFactor, cr, closeCh) | |||
if err != nil { | |||
return nil, 0, 0, 0, 0, nil, nil, nil, err | |||
} | |||
@@ -156,11 +171,10 @@ func computeNewDocCount(segments []*SegmentBase, drops []*roaring.Bitmap) uint64 | |||
} | |||
func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, | |||
fieldsInv []string, fieldsMap map[string]uint16, newDocNumsIn [][]uint64, | |||
newSegDocCount uint64, chunkFactor uint32, | |||
w *CountHashWriter) ([]uint64, uint64, error) { | |||
fieldsInv []string, fieldsMap map[string]uint16, fieldsSame bool, | |||
newDocNumsIn [][]uint64, newSegDocCount uint64, chunkFactor uint32, | |||
w *CountHashWriter, closeCh chan struct{}) ([]uint64, uint64, error) { | |||
var bufReuse bytes.Buffer | |||
var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64) | |||
var bufLoc []uint64 | |||
@@ -168,28 +182,22 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, | |||
var postItr *PostingsIterator | |||
rv := make([]uint64, len(fieldsInv)) | |||
fieldDvLocs := make([]uint64, len(fieldsInv)) | |||
fieldDvLocsStart := make([]uint64, len(fieldsInv)) | |||
fieldDvLocsEnd := make([]uint64, len(fieldsInv)) | |||
tfEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) | |||
locEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) | |||
// docTermMap is keyed by docNum, where the array impl provides | |||
// better memory usage behavior than a sparse-friendlier hashmap | |||
// for when docs have much structural similarity (i.e., every doc | |||
// has a given field) | |||
var docTermMap [][]byte | |||
var vellumBuf bytes.Buffer | |||
newVellum, err := vellum.New(&vellumBuf, nil) | |||
if err != nil { | |||
return nil, 0, err | |||
} | |||
newRoaring := roaring.NewBitmap() | |||
// for each field | |||
for fieldID, fieldName := range fieldsInv { | |||
if fieldID != 0 { | |||
vellumBuf.Reset() | |||
} | |||
newVellum, err := vellum.New(&vellumBuf, nil) | |||
if err != nil { | |||
return nil, 0, err | |||
} | |||
// collect FST iterators from all active segments for this field | |||
var newDocNums [][]uint64 | |||
@@ -197,7 +205,15 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, | |||
var dicts []*Dictionary | |||
var itrs []vellum.Iterator | |||
var segmentsInFocus []*SegmentBase | |||
for segmentI, segment := range segments { | |||
// check for the closure in meantime | |||
if isClosed(closeCh) { | |||
return nil, 0, seg.ErrClosed | |||
} | |||
dict, err2 := segment.dictionary(fieldName) | |||
if err2 != nil { | |||
return nil, 0, err2 | |||
@@ -209,89 +225,63 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, | |||
} | |||
if itr != nil { | |||
newDocNums = append(newDocNums, newDocNumsIn[segmentI]) | |||
drops = append(drops, dropsIn[segmentI]) | |||
if dropsIn[segmentI] != nil && !dropsIn[segmentI].IsEmpty() { | |||
drops = append(drops, dropsIn[segmentI]) | |||
} else { | |||
drops = append(drops, nil) | |||
} | |||
dicts = append(dicts, dict) | |||
itrs = append(itrs, itr) | |||
segmentsInFocus = append(segmentsInFocus, segment) | |||
} | |||
} | |||
} | |||
if uint64(cap(docTermMap)) < newSegDocCount { | |||
docTermMap = make([][]byte, newSegDocCount) | |||
} else { | |||
docTermMap = docTermMap[0:newSegDocCount] | |||
for docNum := range docTermMap { // reset the docTermMap | |||
docTermMap[docNum] = docTermMap[docNum][:0] | |||
} | |||
} | |||
var prevTerm []byte | |||
newRoaring := roaring.NewBitmap() | |||
newRoaringLocs := roaring.NewBitmap() | |||
newRoaring.Clear() | |||
finishTerm := func(term []byte) error { | |||
if term == nil { | |||
return nil | |||
var lastDocNum, lastFreq, lastNorm uint64 | |||
// determines whether to use "1-hit" encoding optimization | |||
// when a term appears in only 1 doc, with no loc info, | |||
// has freq of 1, and the docNum fits into 31-bits | |||
use1HitEncoding := func(termCardinality uint64) (bool, uint64, uint64) { | |||
if termCardinality == uint64(1) && locEncoder.FinalSize() <= 0 { | |||
docNum := uint64(newRoaring.Minimum()) | |||
if under32Bits(docNum) && docNum == lastDocNum && lastFreq == 1 { | |||
return true, docNum, lastNorm | |||
} | |||
} | |||
return false, 0, 0 | |||
} | |||
finishTerm := func(term []byte) error { | |||
tfEncoder.Close() | |||
locEncoder.Close() | |||
if newRoaring.GetCardinality() > 0 { | |||
// this field/term actually has hits in the new segment, lets write it down | |||
freqOffset := uint64(w.Count()) | |||
_, err := tfEncoder.Write(w) | |||
if err != nil { | |||
return err | |||
} | |||
locOffset := uint64(w.Count()) | |||
_, err = locEncoder.Write(w) | |||
if err != nil { | |||
return err | |||
} | |||
postingLocOffset := uint64(w.Count()) | |||
_, err = writeRoaringWithLen(newRoaringLocs, w, &bufReuse, bufMaxVarintLen64) | |||
if err != nil { | |||
return err | |||
} | |||
postingOffset := uint64(w.Count()) | |||
// write out the start of the term info | |||
n := binary.PutUvarint(bufMaxVarintLen64, freqOffset) | |||
_, err = w.Write(bufMaxVarintLen64[:n]) | |||
if err != nil { | |||
return err | |||
} | |||
// write out the start of the loc info | |||
n = binary.PutUvarint(bufMaxVarintLen64, locOffset) | |||
_, err = w.Write(bufMaxVarintLen64[:n]) | |||
if err != nil { | |||
return err | |||
} | |||
// write out the start of the posting locs | |||
n = binary.PutUvarint(bufMaxVarintLen64, postingLocOffset) | |||
_, err = w.Write(bufMaxVarintLen64[:n]) | |||
if err != nil { | |||
return err | |||
} | |||
_, err = writeRoaringWithLen(newRoaring, w, &bufReuse, bufMaxVarintLen64) | |||
if err != nil { | |||
return err | |||
} | |||
postingsOffset, err := writePostings(newRoaring, | |||
tfEncoder, locEncoder, use1HitEncoding, w, bufMaxVarintLen64) | |||
if err != nil { | |||
return err | |||
} | |||
err = newVellum.Insert(term, postingOffset) | |||
if postingsOffset > 0 { | |||
err = newVellum.Insert(term, postingsOffset) | |||
if err != nil { | |||
return err | |||
} | |||
} | |||
newRoaring = roaring.NewBitmap() | |||
newRoaringLocs = roaring.NewBitmap() | |||
newRoaring.Clear() | |||
tfEncoder.Reset() | |||
locEncoder.Reset() | |||
lastDocNum = 0 | |||
lastFreq = 0 | |||
lastNorm = 0 | |||
return nil | |||
} | |||
@@ -301,66 +291,39 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, | |||
term, itrI, postingsOffset := enumerator.Current() | |||
if !bytes.Equal(prevTerm, term) { | |||
// check for the closure in meantime | |||
if isClosed(closeCh) { | |||
return nil, 0, seg.ErrClosed | |||
} | |||
// if the term changed, write out the info collected | |||
// for the previous term | |||
err2 := finishTerm(prevTerm) | |||
if err2 != nil { | |||
return nil, 0, err2 | |||
err = finishTerm(prevTerm) | |||
if err != nil { | |||
return nil, 0, err | |||
} | |||
} | |||
var err2 error | |||
postings, err2 = dicts[itrI].postingsListFromOffset( | |||
postings, err = dicts[itrI].postingsListFromOffset( | |||
postingsOffset, drops[itrI], postings) | |||
if err2 != nil { | |||
return nil, 0, err2 | |||
if err != nil { | |||
return nil, 0, err | |||
} | |||
newDocNumsI := newDocNums[itrI] | |||
postItr = postings.iterator(postItr) | |||
next, err2 := postItr.Next() | |||
for next != nil && err2 == nil { | |||
hitNewDocNum := newDocNumsI[next.Number()] | |||
if hitNewDocNum == docDropped { | |||
return nil, 0, fmt.Errorf("see hit with dropped doc num") | |||
} | |||
newRoaring.Add(uint32(hitNewDocNum)) | |||
// encode norm bits | |||
norm := next.Norm() | |||
normBits := math.Float32bits(float32(norm)) | |||
err = tfEncoder.Add(hitNewDocNum, next.Frequency(), uint64(normBits)) | |||
if err != nil { | |||
return nil, 0, err | |||
} | |||
locs := next.Locations() | |||
if len(locs) > 0 { | |||
newRoaringLocs.Add(uint32(hitNewDocNum)) | |||
for _, loc := range locs { | |||
if cap(bufLoc) < 5+len(loc.ArrayPositions()) { | |||
bufLoc = make([]uint64, 0, 5+len(loc.ArrayPositions())) | |||
} | |||
args := bufLoc[0:5] | |||
args[0] = uint64(fieldsMap[loc.Field()] - 1) | |||
args[1] = loc.Pos() | |||
args[2] = loc.Start() | |||
args[3] = loc.End() | |||
args[4] = uint64(len(loc.ArrayPositions())) | |||
args = append(args, loc.ArrayPositions()...) | |||
err = locEncoder.Add(hitNewDocNum, args...) | |||
if err != nil { | |||
return nil, 0, err | |||
} | |||
} | |||
} | |||
docTermMap[hitNewDocNum] = | |||
append(append(docTermMap[hitNewDocNum], term...), termSeparator) | |||
next, err2 = postItr.Next() | |||
postItr = postings.iterator(true, true, true, postItr) | |||
if fieldsSame { | |||
// can optimize by copying freq/norm/loc bytes directly | |||
lastDocNum, lastFreq, lastNorm, err = mergeTermFreqNormLocsByCopying( | |||
term, postItr, newDocNums[itrI], newRoaring, | |||
tfEncoder, locEncoder) | |||
} else { | |||
lastDocNum, lastFreq, lastNorm, bufLoc, err = mergeTermFreqNormLocs( | |||
fieldsMap, term, postItr, newDocNums[itrI], newRoaring, | |||
tfEncoder, locEncoder, bufLoc) | |||
} | |||
if err2 != nil { | |||
return nil, 0, err2 | |||
if err != nil { | |||
return nil, 0, err | |||
} | |||
prevTerm = prevTerm[:0] // copy to prevTerm in case Next() reuses term mem | |||
@@ -368,7 +331,7 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, | |||
err = enumerator.Next() | |||
} | |||
if err != nil && err != vellum.ErrIteratorDone { | |||
if err != vellum.ErrIteratorDone { | |||
return nil, 0, err | |||
} | |||
@@ -400,26 +363,63 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, | |||
rv[fieldID] = dictOffset | |||
// get the field doc value offset (start) | |||
fieldDvLocsStart[fieldID] = uint64(w.Count()) | |||
// update the field doc values | |||
fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), newSegDocCount-1) | |||
for docNum, docTerms := range docTermMap { | |||
if len(docTerms) > 0 { | |||
err = fdvEncoder.Add(uint64(docNum), docTerms) | |||
fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), newSegDocCount-1, w, true) | |||
fdvReadersAvailable := false | |||
var dvIterClone *docValueReader | |||
for segmentI, segment := range segmentsInFocus { | |||
// check for the closure in meantime | |||
if isClosed(closeCh) { | |||
return nil, 0, seg.ErrClosed | |||
} | |||
fieldIDPlus1 := uint16(segment.fieldsMap[fieldName]) | |||
if dvIter, exists := segment.fieldDvReaders[fieldIDPlus1-1]; exists && | |||
dvIter != nil { | |||
fdvReadersAvailable = true | |||
dvIterClone = dvIter.cloneInto(dvIterClone) | |||
err = dvIterClone.iterateAllDocValues(segment, func(docNum uint64, terms []byte) error { | |||
if newDocNums[segmentI][docNum] == docDropped { | |||
return nil | |||
} | |||
err := fdvEncoder.Add(newDocNums[segmentI][docNum], terms) | |||
if err != nil { | |||
return err | |||
} | |||
return nil | |||
}) | |||
if err != nil { | |||
return nil, 0, err | |||
} | |||
} | |||
} | |||
err = fdvEncoder.Close() | |||
if err != nil { | |||
return nil, 0, err | |||
} | |||
// get the field doc value offset | |||
fieldDvLocs[fieldID] = uint64(w.Count()) | |||
if fdvReadersAvailable { | |||
err = fdvEncoder.Close() | |||
if err != nil { | |||
return nil, 0, err | |||
} | |||
// persist the doc value details for this field | |||
_, err = fdvEncoder.Write() | |||
if err != nil { | |||
return nil, 0, err | |||
} | |||
// get the field doc value offset (end) | |||
fieldDvLocsEnd[fieldID] = uint64(w.Count()) | |||
} else { | |||
fieldDvLocsStart[fieldID] = fieldNotUninverted | |||
fieldDvLocsEnd[fieldID] = fieldNotUninverted | |||
} | |||
// persist the doc value details for this field | |||
_, err = fdvEncoder.Write(w) | |||
// reset vellum buffer and vellum builder | |||
vellumBuf.Reset() | |||
err = newVellum.Reset(&vellumBuf) | |||
if err != nil { | |||
return nil, 0, err | |||
} | |||
@@ -428,38 +428,210 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, | |||
fieldDvLocsOffset := uint64(w.Count()) | |||
buf := bufMaxVarintLen64 | |||
for _, offset := range fieldDvLocs { | |||
n := binary.PutUvarint(buf, uint64(offset)) | |||
for i := 0; i < len(fieldDvLocsStart); i++ { | |||
n := binary.PutUvarint(buf, fieldDvLocsStart[i]) | |||
_, err := w.Write(buf[:n]) | |||
if err != nil { | |||
return nil, 0, err | |||
} | |||
n = binary.PutUvarint(buf, fieldDvLocsEnd[i]) | |||
_, err = w.Write(buf[:n]) | |||
if err != nil { | |||
return nil, 0, err | |||
} | |||
} | |||
return rv, fieldDvLocsOffset, nil | |||
} | |||
func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *PostingsIterator, | |||
newDocNums []uint64, newRoaring *roaring.Bitmap, | |||
tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, bufLoc []uint64) ( | |||
lastDocNum uint64, lastFreq uint64, lastNorm uint64, bufLocOut []uint64, err error) { | |||
next, err := postItr.Next() | |||
for next != nil && err == nil { | |||
hitNewDocNum := newDocNums[next.Number()] | |||
if hitNewDocNum == docDropped { | |||
return 0, 0, 0, nil, fmt.Errorf("see hit with dropped docNum") | |||
} | |||
newRoaring.Add(uint32(hitNewDocNum)) | |||
nextFreq := next.Frequency() | |||
nextNorm := uint64(math.Float32bits(float32(next.Norm()))) | |||
locs := next.Locations() | |||
err = tfEncoder.Add(hitNewDocNum, | |||
encodeFreqHasLocs(nextFreq, len(locs) > 0), nextNorm) | |||
if err != nil { | |||
return 0, 0, 0, nil, err | |||
} | |||
if len(locs) > 0 { | |||
numBytesLocs := 0 | |||
for _, loc := range locs { | |||
ap := loc.ArrayPositions() | |||
numBytesLocs += totalUvarintBytes(uint64(fieldsMap[loc.Field()]-1), | |||
loc.Pos(), loc.Start(), loc.End(), uint64(len(ap)), ap) | |||
} | |||
err = locEncoder.Add(hitNewDocNum, uint64(numBytesLocs)) | |||
if err != nil { | |||
return 0, 0, 0, nil, err | |||
} | |||
for _, loc := range locs { | |||
ap := loc.ArrayPositions() | |||
if cap(bufLoc) < 5+len(ap) { | |||
bufLoc = make([]uint64, 0, 5+len(ap)) | |||
} | |||
args := bufLoc[0:5] | |||
args[0] = uint64(fieldsMap[loc.Field()] - 1) | |||
args[1] = loc.Pos() | |||
args[2] = loc.Start() | |||
args[3] = loc.End() | |||
args[4] = uint64(len(ap)) | |||
args = append(args, ap...) | |||
err = locEncoder.Add(hitNewDocNum, args...) | |||
if err != nil { | |||
return 0, 0, 0, nil, err | |||
} | |||
} | |||
} | |||
lastDocNum = hitNewDocNum | |||
lastFreq = nextFreq | |||
lastNorm = nextNorm | |||
next, err = postItr.Next() | |||
} | |||
return lastDocNum, lastFreq, lastNorm, bufLoc, err | |||
} | |||
func mergeTermFreqNormLocsByCopying(term []byte, postItr *PostingsIterator, | |||
newDocNums []uint64, newRoaring *roaring.Bitmap, | |||
tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder) ( | |||
lastDocNum uint64, lastFreq uint64, lastNorm uint64, err error) { | |||
nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, nextLocBytes, err := | |||
postItr.nextBytes() | |||
for err == nil && len(nextFreqNormBytes) > 0 { | |||
hitNewDocNum := newDocNums[nextDocNum] | |||
if hitNewDocNum == docDropped { | |||
return 0, 0, 0, fmt.Errorf("see hit with dropped doc num") | |||
} | |||
newRoaring.Add(uint32(hitNewDocNum)) | |||
err = tfEncoder.AddBytes(hitNewDocNum, nextFreqNormBytes) | |||
if err != nil { | |||
return 0, 0, 0, err | |||
} | |||
if len(nextLocBytes) > 0 { | |||
err = locEncoder.AddBytes(hitNewDocNum, nextLocBytes) | |||
if err != nil { | |||
return 0, 0, 0, err | |||
} | |||
} | |||
lastDocNum = hitNewDocNum | |||
lastFreq = nextFreq | |||
lastNorm = nextNorm | |||
nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, nextLocBytes, err = | |||
postItr.nextBytes() | |||
} | |||
return lastDocNum, lastFreq, lastNorm, err | |||
} | |||
func writePostings(postings *roaring.Bitmap, tfEncoder, locEncoder *chunkedIntCoder, | |||
use1HitEncoding func(uint64) (bool, uint64, uint64), | |||
w *CountHashWriter, bufMaxVarintLen64 []byte) ( | |||
offset uint64, err error) { | |||
termCardinality := postings.GetCardinality() | |||
if termCardinality <= 0 { | |||
return 0, nil | |||
} | |||
if use1HitEncoding != nil { | |||
encodeAs1Hit, docNum1Hit, normBits1Hit := use1HitEncoding(termCardinality) | |||
if encodeAs1Hit { | |||
return FSTValEncode1Hit(docNum1Hit, normBits1Hit), nil | |||
} | |||
} | |||
tfOffset := uint64(w.Count()) | |||
_, err = tfEncoder.Write(w) | |||
if err != nil { | |||
return 0, err | |||
} | |||
locOffset := uint64(w.Count()) | |||
_, err = locEncoder.Write(w) | |||
if err != nil { | |||
return 0, err | |||
} | |||
postingsOffset := uint64(w.Count()) | |||
n := binary.PutUvarint(bufMaxVarintLen64, tfOffset) | |||
_, err = w.Write(bufMaxVarintLen64[:n]) | |||
if err != nil { | |||
return 0, err | |||
} | |||
n = binary.PutUvarint(bufMaxVarintLen64, locOffset) | |||
_, err = w.Write(bufMaxVarintLen64[:n]) | |||
if err != nil { | |||
return 0, err | |||
} | |||
_, err = writeRoaringWithLen(postings, w, bufMaxVarintLen64) | |||
if err != nil { | |||
return 0, err | |||
} | |||
return postingsOffset, nil | |||
} | |||
type varintEncoder func(uint64) (int, error) | |||
func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, | |||
fieldsMap map[string]uint16, fieldsInv []string, fieldsSame bool, newSegDocCount uint64, | |||
w *CountHashWriter) (uint64, [][]uint64, error) { | |||
w *CountHashWriter, closeCh chan struct{}) (uint64, [][]uint64, error) { | |||
var rv [][]uint64 // The remapped or newDocNums for each segment. | |||
var newDocNum uint64 | |||
var curr int | |||
var metaBuf bytes.Buffer | |||
var data, compressed []byte | |||
metaEncoder := govarint.NewU64Base128Encoder(&metaBuf) | |||
var metaBuf bytes.Buffer | |||
varBuf := make([]byte, binary.MaxVarintLen64) | |||
metaEncode := func(val uint64) (int, error) { | |||
wb := binary.PutUvarint(varBuf, val) | |||
return metaBuf.Write(varBuf[:wb]) | |||
} | |||
vals := make([][][]byte, len(fieldsInv)) | |||
typs := make([][]byte, len(fieldsInv)) | |||
poss := make([][][]uint64, len(fieldsInv)) | |||
var posBuf []uint64 | |||
docNumOffsets := make([]uint64, newSegDocCount) | |||
vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) | |||
defer visitDocumentCtxPool.Put(vdc) | |||
// for each segment | |||
for segI, segment := range segments { | |||
// check for the closure in meantime | |||
if isClosed(closeCh) { | |||
return 0, nil, seg.ErrClosed | |||
} | |||
segNewDocNums := make([]uint64, segment.numDocs) | |||
dropsI := drops[segI] | |||
@@ -495,7 +667,8 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, | |||
curr = 0 | |||
metaBuf.Reset() | |||
data = data[:0] | |||
compressed = compressed[:0] | |||
posTemp := posBuf | |||
// collect all the data | |||
for i := 0; i < len(fieldsInv); i++ { | |||
@@ -503,42 +676,63 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, | |||
typs[i] = typs[i][:0] | |||
poss[i] = poss[i][:0] | |||
} | |||
err := segment.VisitDocument(docNum, func(field string, typ byte, value []byte, pos []uint64) bool { | |||
err := segment.visitDocument(vdc, docNum, func(field string, typ byte, value []byte, pos []uint64) bool { | |||
fieldID := int(fieldsMap[field]) - 1 | |||
vals[fieldID] = append(vals[fieldID], value) | |||
typs[fieldID] = append(typs[fieldID], typ) | |||
poss[fieldID] = append(poss[fieldID], pos) | |||
// copy array positions to preserve them beyond the scope of this callback | |||
var curPos []uint64 | |||
if len(pos) > 0 { | |||
if cap(posTemp) < len(pos) { | |||
posBuf = make([]uint64, len(pos)*len(fieldsInv)) | |||
posTemp = posBuf | |||
} | |||
curPos = posTemp[0:len(pos)] | |||
copy(curPos, pos) | |||
posTemp = posTemp[len(pos):] | |||
} | |||
poss[fieldID] = append(poss[fieldID], curPos) | |||
return true | |||
}) | |||
if err != nil { | |||
return 0, nil, err | |||
} | |||
// now walk the fields in order | |||
for fieldID := range fieldsInv { | |||
storedFieldValues := vals[int(fieldID)] | |||
// _id field special case optimizes ExternalID() lookups | |||
idFieldVal := vals[uint16(0)][0] | |||
_, err = metaEncode(uint64(len(idFieldVal))) | |||
if err != nil { | |||
return 0, nil, err | |||
} | |||
// now walk the non-"_id" fields in order | |||
for fieldID := 1; fieldID < len(fieldsInv); fieldID++ { | |||
storedFieldValues := vals[fieldID] | |||
stf := typs[int(fieldID)] | |||
spf := poss[int(fieldID)] | |||
stf := typs[fieldID] | |||
spf := poss[fieldID] | |||
var err2 error | |||
curr, data, err2 = persistStoredFieldValues(fieldID, | |||
storedFieldValues, stf, spf, curr, metaEncoder, data) | |||
storedFieldValues, stf, spf, curr, metaEncode, data) | |||
if err2 != nil { | |||
return 0, nil, err2 | |||
} | |||
} | |||
metaEncoder.Close() | |||
metaBytes := metaBuf.Bytes() | |||
compressed = snappy.Encode(compressed, data) | |||
compressed = snappy.Encode(compressed[:cap(compressed)], data) | |||
// record where we're about to start writing | |||
docNumOffsets[newDocNum] = uint64(w.Count()) | |||
// write out the meta len and compressed data len | |||
_, err = writeUvarints(w, uint64(len(metaBytes)), uint64(len(compressed))) | |||
_, err = writeUvarints(w, | |||
uint64(len(metaBytes)), | |||
uint64(len(idFieldVal)+len(compressed))) | |||
if err != nil { | |||
return 0, nil, err | |||
} | |||
@@ -547,6 +741,11 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, | |||
if err != nil { | |||
return 0, nil, err | |||
} | |||
// now write the _id field val (counted as part of the 'compressed' data) | |||
_, err = w.Write(idFieldVal) | |||
if err != nil { | |||
return 0, nil, err | |||
} | |||
// now write the compressed data | |||
_, err = w.Write(compressed) | |||
if err != nil { | |||
@@ -644,3 +843,12 @@ func mergeFields(segments []*SegmentBase) (bool, []string) { | |||
return fieldsSame, rv | |||
} | |||
// isClosed reports whether closeCh has been closed, without ever
// blocking the caller.
func isClosed(closeCh chan struct{}) bool {
	closed := false
	select {
	case <-closeCh:
		closed = true
	default:
		// nothing pending; channel still open
	}
	return closed
}
@@ -0,0 +1,826 @@ | |||
// Copyright (c) 2018 Couchbase, Inc. | |||
// | |||
// Licensed under the Apache License, Version 2.0 (the "License"); | |||
// you may not use this file except in compliance with the License. | |||
// You may obtain a copy of the License at | |||
// | |||
// http://www.apache.org/licenses/LICENSE-2.0 | |||
// | |||
// Unless required by applicable law or agreed to in writing, software | |||
// distributed under the License is distributed on an "AS IS" BASIS, | |||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
// See the License for the specific language governing permissions and | |||
// limitations under the License. | |||
package zap | |||
import ( | |||
"bytes" | |||
"encoding/binary" | |||
"math" | |||
"sort" | |||
"sync" | |||
"github.com/RoaringBitmap/roaring" | |||
"github.com/blevesearch/bleve/analysis" | |||
"github.com/blevesearch/bleve/document" | |||
"github.com/blevesearch/bleve/index" | |||
"github.com/couchbase/vellum" | |||
"github.com/golang/snappy" | |||
) | |||
// Tunables used by AnalysisResultsToSegmentBase to pre-size its output
// buffer from stats recorded during the previous segment build.
var NewSegmentBufferNumResultsBump int = 100
var NewSegmentBufferNumResultsFactor float64 = 1.0
var NewSegmentBufferAvgBytesPerDocFactor float64 = 1.0
// AnalysisResultsToSegmentBase produces an in-memory zap-encoded | |||
// SegmentBase from analysis results | |||
func AnalysisResultsToSegmentBase(results []*index.AnalysisResult, | |||
chunkFactor uint32) (*SegmentBase, uint64, error) { | |||
s := interimPool.Get().(*interim) | |||
var br bytes.Buffer | |||
if s.lastNumDocs > 0 { | |||
// use previous results to initialize the buf with an estimate | |||
// size, but note that the interim instance comes from a | |||
// global interimPool, so multiple scorch instances indexing | |||
// different docs can lead to low quality estimates | |||
estimateAvgBytesPerDoc := int(float64(s.lastOutSize/s.lastNumDocs) * | |||
NewSegmentBufferNumResultsFactor) | |||
estimateNumResults := int(float64(len(results)+NewSegmentBufferNumResultsBump) * | |||
NewSegmentBufferAvgBytesPerDocFactor) | |||
br.Grow(estimateAvgBytesPerDoc * estimateNumResults) | |||
} | |||
s.results = results | |||
s.chunkFactor = chunkFactor | |||
s.w = NewCountHashWriter(&br) | |||
storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, | |||
err := s.convert() | |||
if err != nil { | |||
return nil, uint64(0), err | |||
} | |||
sb, err := InitSegmentBase(br.Bytes(), s.w.Sum32(), chunkFactor, | |||
s.FieldsMap, s.FieldsInv, uint64(len(results)), | |||
storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets) | |||
if err == nil && s.reset() == nil { | |||
s.lastNumDocs = len(results) | |||
s.lastOutSize = len(br.Bytes()) | |||
interimPool.Put(s) | |||
} | |||
return sb, uint64(len(br.Bytes())), err | |||
} | |||
// interimPool recycles interim instances across segment builds to
// amortize their slice/map allocations.
var interimPool = sync.Pool{New: func() interface{} { return &interim{} }}

// interim holds temporary working data used while converting from
// analysis results to a zap-encoded segment
type interim struct {
	// the analysis results being converted
	results []*index.AnalysisResult

	// chunking factor handed to the int/content coders
	chunkFactor uint32

	// destination writer (counts bytes written and maintains a hash)
	w *CountHashWriter

	// FieldsMap adds 1 to field id to avoid zero value issues
	//  name -> field id + 1
	FieldsMap map[string]uint16

	// FieldsInv is the inverse of FieldsMap
	//  field id -> name
	FieldsInv []string

	// Term dictionaries for each field
	//  field id -> term -> postings list id + 1
	Dicts []map[string]uint64

	// Terms for each field, where terms are sorted ascending
	//  field id -> []term
	DictKeys [][]string

	// Fields whose IncludeDocValues is true
	//  field id -> bool
	IncludeDocValues []bool

	// postings id -> bitmap of docNums
	Postings []*roaring.Bitmap

	// postings id -> freq/norm's, one for each docNum in postings
	FreqNorms        [][]interimFreqNorm
	freqNormsBacking []interimFreqNorm // shared backing array for FreqNorms

	// postings id -> locs, one for each freq
	Locs        [][]interimLoc
	locsBacking []interimLoc // shared backing array for Locs

	numTermsPerPostingsList []int // key is postings list id
	numLocsPerPostingsList  []int // key is postings list id

	// vellum FST builder for term dictionaries, and its output buffer
	builder    *vellum.Builder
	builderBuf bytes.Buffer

	// scratch buffer for stored-field metadata encoding
	metaBuf bytes.Buffer

	// reusable scratch byte slices
	tmp0 []byte
	tmp1 []byte

	// stats from the previous build, used to estimate buffer sizes
	lastNumDocs int
	lastOutSize int
}
// reset clears the interim for reuse from the pool, retaining the
// capacity of its slices and buffers so the next build avoids
// reallocating. Returns any error from resetting the vellum builder.
func (s *interim) reset() (err error) {
	s.results = nil
	s.chunkFactor = 0
	s.w = nil
	s.FieldsMap = nil
	s.FieldsInv = nil
	for i := range s.Dicts {
		s.Dicts[i] = nil
	}
	s.Dicts = s.Dicts[:0]
	for i := range s.DictKeys {
		// keep each inner slice's capacity, just empty it
		s.DictKeys[i] = s.DictKeys[i][:0]
	}
	s.DictKeys = s.DictKeys[:0]
	for i := range s.IncludeDocValues {
		s.IncludeDocValues[i] = false
	}
	s.IncludeDocValues = s.IncludeDocValues[:0]
	for _, idn := range s.Postings {
		// bitmaps are reused, so clear contents rather than drop them
		idn.Clear()
	}
	s.Postings = s.Postings[:0]
	s.FreqNorms = s.FreqNorms[:0]
	// zero backing entries before truncating so nothing stale is
	// observed when the backing arrays are re-carved next build
	for i := range s.freqNormsBacking {
		s.freqNormsBacking[i] = interimFreqNorm{}
	}
	s.freqNormsBacking = s.freqNormsBacking[:0]
	s.Locs = s.Locs[:0]
	for i := range s.locsBacking {
		s.locsBacking[i] = interimLoc{}
	}
	s.locsBacking = s.locsBacking[:0]
	s.numTermsPerPostingsList = s.numTermsPerPostingsList[:0]
	s.numLocsPerPostingsList = s.numLocsPerPostingsList[:0]
	s.builderBuf.Reset()
	if s.builder != nil {
		err = s.builder.Reset(&s.builderBuf)
	}
	s.metaBuf.Reset()
	s.tmp0 = s.tmp0[:0]
	s.tmp1 = s.tmp1[:0]
	s.lastNumDocs = 0
	s.lastOutSize = 0
	return err
}
func (s *interim) grabBuf(size int) []byte { | |||
buf := s.tmp0 | |||
if cap(buf) < size { | |||
buf = make([]byte, size) | |||
s.tmp0 = buf | |||
} | |||
return buf[0:size] | |||
} | |||
// interimStoredField collects one document's stored values for a
// single field; the three slices are parallel, one entry per value.
type interimStoredField struct {
	vals      [][]byte
	typs      []byte
	arrayposs [][]uint64 // array positions
}

// interimFreqNorm is the per-(term, doc) frequency/norm entry.
type interimFreqNorm struct {
	freq    uint64
	norm    float32
	numLocs int // number of interimLoc entries belonging to this hit
}

// interimLoc records one token occurrence: its field, token position,
// byte offsets, and optional array positions.
type interimLoc struct {
	fieldID   uint16
	pos       uint64
	start     uint64
	end       uint64
	arrayposs []uint64
}
// convert drives the encoding of the accumulated analysis results into
// the zap layout via s.w, returning the stored-field index offset, the
// fields index offset, the field doc-values index offset, and the
// per-field dictionary offsets.
func (s *interim) convert() (uint64, uint64, uint64, []uint64, error) {
	s.FieldsMap = map[string]uint16{}
	s.getOrDefineField("_id") // _id field is fieldID 0
	// discover every field name up front
	for _, result := range s.results {
		for _, field := range result.Document.CompositeFields {
			s.getOrDefineField(field.Name())
		}
		for _, field := range result.Document.Fields {
			s.getOrDefineField(field.Name())
		}
	}
	sort.Strings(s.FieldsInv[1:]) // keep _id as first field
	// rebuild FieldsMap to match the sorted order (ids stored +1)
	for fieldID, fieldName := range s.FieldsInv {
		s.FieldsMap[fieldName] = uint16(fieldID + 1)
	}
	// reuse IncludeDocValues capacity when possible
	if cap(s.IncludeDocValues) >= len(s.FieldsInv) {
		s.IncludeDocValues = s.IncludeDocValues[:len(s.FieldsInv)]
	} else {
		s.IncludeDocValues = make([]bool, len(s.FieldsInv))
	}
	s.prepareDicts()
	for _, dict := range s.DictKeys {
		sort.Strings(dict)
	}
	s.processDocuments()
	storedIndexOffset, err := s.writeStoredFields()
	if err != nil {
		return 0, 0, 0, nil, err
	}
	var fdvIndexOffset uint64
	var dictOffsets []uint64
	if len(s.results) > 0 {
		fdvIndexOffset, dictOffsets, err = s.writeDicts()
		if err != nil {
			return 0, 0, 0, nil, err
		}
	} else {
		// empty segment: no dictionaries to write, offsets stay zero
		dictOffsets = make([]uint64, len(s.FieldsInv))
	}
	fieldsIndexOffset, err := persistFields(s.FieldsInv, s.w, dictOffsets)
	if err != nil {
		return 0, 0, 0, nil, err
	}
	return storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, nil
}
func (s *interim) getOrDefineField(fieldName string) int { | |||
fieldIDPlus1, exists := s.FieldsMap[fieldName] | |||
if !exists { | |||
fieldIDPlus1 = uint16(len(s.FieldsInv) + 1) | |||
s.FieldsMap[fieldName] = fieldIDPlus1 | |||
s.FieldsInv = append(s.FieldsInv, fieldName) | |||
s.Dicts = append(s.Dicts, make(map[string]uint64)) | |||
n := len(s.DictKeys) | |||
if n < cap(s.DictKeys) { | |||
s.DictKeys = s.DictKeys[:n+1] | |||
s.DictKeys[n] = s.DictKeys[n][:0] | |||
} else { | |||
s.DictKeys = append(s.DictKeys, []string(nil)) | |||
} | |||
} | |||
return int(fieldIDPlus1 - 1) | |||
} | |||
// prepareDicts fills Dicts and DictKeys from the analysis results,
// counting terms and locations per postings list, then pre-sizes
// Postings, FreqNorms, and Locs — the latter two carved out of shared
// backing arrays sized by those counts.
func (s *interim) prepareDicts() {
	var pidNext int
	var totTFs int
	var totLocs int
	visitField := func(fieldID uint16, tfs analysis.TokenFrequencies) {
		dict := s.Dicts[fieldID]
		dictKeys := s.DictKeys[fieldID]
		for term, tf := range tfs {
			pidPlus1, exists := dict[term]
			if !exists {
				// first sighting of this term: assign the next
				// postings list id and start its counters at zero
				pidNext++
				pidPlus1 = uint64(pidNext)
				dict[term] = pidPlus1
				dictKeys = append(dictKeys, term)
				s.numTermsPerPostingsList = append(s.numTermsPerPostingsList, 0)
				s.numLocsPerPostingsList = append(s.numLocsPerPostingsList, 0)
			}
			pid := pidPlus1 - 1
			s.numTermsPerPostingsList[pid] += 1
			s.numLocsPerPostingsList[pid] += len(tf.Locations)
			totLocs += len(tf.Locations)
		}
		totTFs += len(tfs)
		s.DictKeys[fieldID] = dictKeys
	}
	for _, result := range s.results {
		// walk each composite field
		for _, field := range result.Document.CompositeFields {
			fieldID := uint16(s.getOrDefineField(field.Name()))
			_, tf := field.Analyze()
			visitField(fieldID, tf)
		}
		// walk each field
		for i, field := range result.Document.Fields {
			fieldID := uint16(s.getOrDefineField(field.Name()))
			tf := result.Analyzed[i]
			visitField(fieldID, tf)
		}
	}
	numPostingsLists := pidNext
	// reuse or grow the postings bitmaps
	if cap(s.Postings) >= numPostingsLists {
		s.Postings = s.Postings[:numPostingsLists]
	} else {
		postings := make([]*roaring.Bitmap, numPostingsLists)
		copy(postings, s.Postings[:cap(s.Postings)])
		for i := 0; i < numPostingsLists; i++ {
			if postings[i] == nil {
				postings[i] = roaring.New()
			}
		}
		s.Postings = postings
	}
	if cap(s.FreqNorms) >= numPostingsLists {
		s.FreqNorms = s.FreqNorms[:numPostingsLists]
	} else {
		s.FreqNorms = make([][]interimFreqNorm, numPostingsLists)
	}
	if cap(s.freqNormsBacking) >= totTFs {
		s.freqNormsBacking = s.freqNormsBacking[:totTFs]
	} else {
		s.freqNormsBacking = make([]interimFreqNorm, totTFs)
	}
	// carve each postings list's freq/norm slice (len 0) out of the
	// shared backing array; appends of up to numTerms entries land in
	// this pid's region before the next region begins
	freqNormsBacking := s.freqNormsBacking
	for pid, numTerms := range s.numTermsPerPostingsList {
		s.FreqNorms[pid] = freqNormsBacking[0:0]
		freqNormsBacking = freqNormsBacking[numTerms:]
	}
	if cap(s.Locs) >= numPostingsLists {
		s.Locs = s.Locs[:numPostingsLists]
	} else {
		s.Locs = make([][]interimLoc, numPostingsLists)
	}
	if cap(s.locsBacking) >= totLocs {
		s.locsBacking = s.locsBacking[:totLocs]
	} else {
		s.locsBacking = make([]interimLoc, totLocs)
	}
	// same carving scheme for the location entries
	locsBacking := s.locsBacking
	for pid, numLocs := range s.numLocsPerPostingsList {
		s.Locs[pid] = locsBacking[0:0]
		locsBacking = locsBacking[numLocs:]
	}
}
func (s *interim) processDocuments() { | |||
numFields := len(s.FieldsInv) | |||
reuseFieldLens := make([]int, numFields) | |||
reuseFieldTFs := make([]analysis.TokenFrequencies, numFields) | |||
for docNum, result := range s.results { | |||
for i := 0; i < numFields; i++ { // clear these for reuse | |||
reuseFieldLens[i] = 0 | |||
reuseFieldTFs[i] = nil | |||
} | |||
s.processDocument(uint64(docNum), result, | |||
reuseFieldLens, reuseFieldTFs) | |||
} | |||
} | |||
// processDocument folds one document's analyzed tokens into the
// postings bitmaps, FreqNorms, and Locs structures pre-sized by
// prepareDicts. fieldLens and fieldTFs are caller-owned scratch slices
// (cleared per doc) used to roll up token frequencies per field.
func (s *interim) processDocument(docNum uint64,
	result *index.AnalysisResult,
	fieldLens []int, fieldTFs []analysis.TokenFrequencies) {
	visitField := func(fieldID uint16, fieldName string,
		ln int, tf analysis.TokenFrequencies) {
		fieldLens[fieldID] += ln
		existingFreqs := fieldTFs[fieldID]
		if existingFreqs != nil {
			// merge subsequent token runs for the same field
			existingFreqs.MergeAll(fieldName, tf)
		} else {
			fieldTFs[fieldID] = tf
		}
	}
	// walk each composite field
	for _, field := range result.Document.CompositeFields {
		fieldID := uint16(s.getOrDefineField(field.Name()))
		ln, tf := field.Analyze()
		visitField(fieldID, field.Name(), ln, tf)
	}
	// walk each field
	for i, field := range result.Document.Fields {
		fieldID := uint16(s.getOrDefineField(field.Name()))
		ln := result.Length[i]
		tf := result.Analyzed[i]
		visitField(fieldID, field.Name(), ln, tf)
	}
	// now that it's been rolled up into fieldTFs, walk that
	for fieldID, tfs := range fieldTFs {
		dict := s.Dicts[fieldID]
		// norm is 1/sqrt(field length), stored per (term, doc) hit
		norm := float32(1.0 / math.Sqrt(float64(fieldLens[fieldID])))
		for term, tf := range tfs {
			pid := dict[term] - 1
			bs := s.Postings[pid]
			bs.Add(uint32(docNum))
			s.FreqNorms[pid] = append(s.FreqNorms[pid],
				interimFreqNorm{
					freq:    uint64(tf.Frequency()),
					norm:    norm,
					numLocs: len(tf.Locations),
				})
			if len(tf.Locations) > 0 {
				locs := s.Locs[pid]
				for _, loc := range tf.Locations {
					var locf = uint16(fieldID)
					if loc.Field != "" {
						// the location may name a different field
						// than the one being walked; honor it
						locf = uint16(s.getOrDefineField(loc.Field))
					}
					var arrayposs []uint64
					if len(loc.ArrayPositions) > 0 {
						arrayposs = loc.ArrayPositions
					}
					locs = append(locs, interimLoc{
						fieldID:   locf,
						pos:       uint64(loc.Position),
						start:     uint64(loc.Start),
						end:       uint64(loc.End),
						arrayposs: arrayposs,
					})
				}
				s.Locs[pid] = locs
			}
		}
	}
}
// writeStoredFields writes each document's stored-field record (meta
// bytes, the raw _id value, then snappy-compressed field data) to s.w,
// followed by a fixed-width index of per-doc offsets; it returns the
// offset of that index.
func (s *interim) writeStoredFields() (
	storedIndexOffset uint64, err error) {
	varBuf := make([]byte, binary.MaxVarintLen64)
	// metaEncode appends one uvarint to the meta buffer
	metaEncode := func(val uint64) (int, error) {
		wb := binary.PutUvarint(varBuf, val)
		return s.metaBuf.Write(varBuf[:wb])
	}
	data, compressed := s.tmp0[:0], s.tmp1[:0]
	// hand the (possibly grown) scratch slices back to the interim
	defer func() { s.tmp0, s.tmp1 = data, compressed }()
	// keyed by docNum
	docStoredOffsets := make([]uint64, len(s.results))
	// keyed by fieldID, for the current doc in the loop
	docStoredFields := map[uint16]interimStoredField{}
	for docNum, result := range s.results {
		for fieldID := range docStoredFields { // reset for next doc
			delete(docStoredFields, fieldID)
		}
		for _, field := range result.Document.Fields {
			fieldID := uint16(s.getOrDefineField(field.Name()))
			opts := field.Options()
			if opts.IsStored() {
				isf := docStoredFields[fieldID]
				isf.vals = append(isf.vals, field.Value())
				isf.typs = append(isf.typs, encodeFieldType(field))
				isf.arrayposs = append(isf.arrayposs, field.ArrayPositions())
				docStoredFields[fieldID] = isf
			}
			if opts.IncludeDocValues() {
				s.IncludeDocValues[fieldID] = true
			}
		}
		var curr int
		s.metaBuf.Reset()
		data = data[:0]
		// _id field special case optimizes ExternalID() lookups
		idFieldVal := docStoredFields[uint16(0)].vals[0]
		_, err = metaEncode(uint64(len(idFieldVal)))
		if err != nil {
			return 0, err
		}
		// handle non-"_id" fields
		for fieldID := 1; fieldID < len(s.FieldsInv); fieldID++ {
			isf, exists := docStoredFields[uint16(fieldID)]
			if exists {
				curr, data, err = persistStoredFieldValues(
					fieldID, isf.vals, isf.typs, isf.arrayposs,
					curr, metaEncode, data)
				if err != nil {
					return 0, err
				}
			}
		}
		metaBytes := s.metaBuf.Bytes()
		compressed = snappy.Encode(compressed[:cap(compressed)], data)
		// record where this doc's record starts
		docStoredOffsets[docNum] = uint64(s.w.Count())
		// header: meta length, then length of (_id + compressed data)
		_, err := writeUvarints(s.w,
			uint64(len(metaBytes)),
			uint64(len(idFieldVal)+len(compressed)))
		if err != nil {
			return 0, err
		}
		_, err = s.w.Write(metaBytes)
		if err != nil {
			return 0, err
		}
		// the _id value is written uncompressed, counted as part of
		// the data length above
		_, err = s.w.Write(idFieldVal)
		if err != nil {
			return 0, err
		}
		_, err = s.w.Write(compressed)
		if err != nil {
			return 0, err
		}
	}
	storedIndexOffset = uint64(s.w.Count())
	// fixed-width big-endian uint64 per doc enables random access
	for _, docStoredOffset := range docStoredOffsets {
		err = binary.Write(s.w, binary.BigEndian, docStoredOffset)
		if err != nil {
			return 0, err
		}
	}
	return storedIndexOffset, nil
}
// writeDicts writes, per field: every term's postings (freq/norm
// chunks, location chunks, postings bitmap), the length-prefixed
// vellum term dictionary, and any field doc-value data, finishing with
// a trailer of per-field doc-value start/end offsets. Returns the
// doc-values index offset and the per-field dictionary offsets.
func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err error) {
	dictOffsets = make([]uint64, len(s.FieldsInv))
	fdvOffsetsStart := make([]uint64, len(s.FieldsInv))
	fdvOffsetsEnd := make([]uint64, len(s.FieldsInv))
	buf := s.grabBuf(binary.MaxVarintLen64)
	tfEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1))
	locEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1))
	fdvEncoder := newChunkedContentCoder(uint64(s.chunkFactor), uint64(len(s.results)-1), s.w, false)
	var docTermMap [][]byte
	if s.builder == nil {
		s.builder, err = vellum.New(&s.builderBuf, nil)
		if err != nil {
			return 0, nil, err
		}
	}
	for fieldID, terms := range s.DictKeys {
		// (re)size and clear the per-doc term accumulation buffers
		if cap(docTermMap) < len(s.results) {
			docTermMap = make([][]byte, len(s.results))
		} else {
			docTermMap = docTermMap[0:len(s.results)]
			for docNum := range docTermMap { // reset the docTermMap
				docTermMap[docNum] = docTermMap[docNum][:0]
			}
		}
		dict := s.Dicts[fieldID]
		for _, term := range terms { // terms are already sorted
			pid := dict[term] - 1
			postingsBS := s.Postings[pid]
			freqNorms := s.FreqNorms[pid]
			freqNormOffset := 0
			locs := s.Locs[pid]
			locOffset := 0
			postingsItr := postingsBS.Iterator()
			for postingsItr.HasNext() {
				docNum := uint64(postingsItr.Next())
				freqNorm := freqNorms[freqNormOffset]
				err = tfEncoder.Add(docNum,
					encodeFreqHasLocs(freqNorm.freq, freqNorm.numLocs > 0),
					uint64(math.Float32bits(freqNorm.norm)))
				if err != nil {
					return 0, nil, err
				}
				if freqNorm.numLocs > 0 {
					// first emit the total uvarint byte size of this
					// doc's location entries...
					numBytesLocs := 0
					for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] {
						numBytesLocs += totalUvarintBytes(
							uint64(loc.fieldID), loc.pos, loc.start, loc.end,
							uint64(len(loc.arrayposs)), loc.arrayposs)
					}
					err = locEncoder.Add(docNum, uint64(numBytesLocs))
					if err != nil {
						return 0, nil, err
					}
					// ...then each location's fields followed by its
					// array positions
					for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] {
						err = locEncoder.Add(docNum,
							uint64(loc.fieldID), loc.pos, loc.start, loc.end,
							uint64(len(loc.arrayposs)))
						if err != nil {
							return 0, nil, err
						}
						err = locEncoder.Add(docNum, loc.arrayposs...)
						if err != nil {
							return 0, nil, err
						}
					}
					locOffset += freqNorm.numLocs
				}
				freqNormOffset++
				// accumulate this doc's terms for doc values output
				docTermMap[docNum] = append(
					append(docTermMap[docNum], term...),
					termSeparator)
			}
			tfEncoder.Close()
			locEncoder.Close()
			postingsOffset, err :=
				writePostings(postingsBS, tfEncoder, locEncoder, nil, s.w, buf)
			if err != nil {
				return 0, nil, err
			}
			if postingsOffset > uint64(0) {
				err = s.builder.Insert([]byte(term), postingsOffset)
				if err != nil {
					return 0, nil, err
				}
			}
			tfEncoder.Reset()
			locEncoder.Reset()
		}
		err = s.builder.Close()
		if err != nil {
			return 0, nil, err
		}
		// record where this dictionary starts
		dictOffsets[fieldID] = uint64(s.w.Count())
		vellumData := s.builderBuf.Bytes()
		// write out the length of the vellum data
		n := binary.PutUvarint(buf, uint64(len(vellumData)))
		_, err = s.w.Write(buf[:n])
		if err != nil {
			return 0, nil, err
		}
		// write this vellum to disk
		_, err = s.w.Write(vellumData)
		if err != nil {
			return 0, nil, err
		}
		// reset vellum for reuse
		s.builderBuf.Reset()
		err = s.builder.Reset(&s.builderBuf)
		if err != nil {
			return 0, nil, err
		}
		// write the field doc values
		if s.IncludeDocValues[fieldID] {
			for docNum, docTerms := range docTermMap {
				if len(docTerms) > 0 {
					err = fdvEncoder.Add(uint64(docNum), docTerms)
					if err != nil {
						return 0, nil, err
					}
				}
			}
			err = fdvEncoder.Close()
			if err != nil {
				return 0, nil, err
			}
			fdvOffsetsStart[fieldID] = uint64(s.w.Count())
			_, err = fdvEncoder.Write()
			if err != nil {
				return 0, nil, err
			}
			fdvOffsetsEnd[fieldID] = uint64(s.w.Count())
			fdvEncoder.Reset()
		} else {
			// sentinel marking "no doc values" for this field
			fdvOffsetsStart[fieldID] = fieldNotUninverted
			fdvOffsetsEnd[fieldID] = fieldNotUninverted
		}
	}
	fdvIndexOffset = uint64(s.w.Count())
	// trailer: uvarint start/end doc-value offsets, one pair per field
	for i := 0; i < len(fdvOffsetsStart); i++ {
		n := binary.PutUvarint(buf, fdvOffsetsStart[i])
		_, err := s.w.Write(buf[:n])
		if err != nil {
			return 0, nil, err
		}
		n = binary.PutUvarint(buf, fdvOffsetsEnd[i])
		_, err = s.w.Write(buf[:n])
		if err != nil {
			return 0, nil, err
		}
	}
	return fdvIndexOffset, dictOffsets, nil
}
func encodeFieldType(f document.Field) byte { | |||
fieldType := byte('x') | |||
switch f.(type) { | |||
case *document.TextField: | |||
fieldType = 't' | |||
case *document.NumericField: | |||
fieldType = 'n' | |||
case *document.DateTimeField: | |||
fieldType = 'd' | |||
case *document.BooleanField: | |||
fieldType = 'b' | |||
case *document.GeoPointField: | |||
fieldType = 'g' | |||
case *document.CompositeField: | |||
fieldType = 'c' | |||
} | |||
return fieldType | |||
} | |||
// totalUvarintBytes returns the total number of bytes that
// binary.PutUvarint would need to encode the five values a..e plus
// every value in more.
func totalUvarintBytes(a, b, c, d, e uint64, more []uint64) (n int) {
	n = numUvarintBytes(a) + numUvarintBytes(b) + numUvarintBytes(c) +
		numUvarintBytes(d) + numUvarintBytes(e)
	for _, v := range more {
		n += numUvarintBytes(v)
	}
	return n
}

// numUvarintBytes returns the number of bytes binary.PutUvarint would
// use to encode x: one byte per 7 bits of payload, minimum one.
func numUvarintBytes(x uint64) (n int) {
	n = 1
	for x >= 0x80 {
		x >>= 7
		n++
	}
	return n
}
@@ -20,16 +20,24 @@ import ( | |||
"fmt" | |||
"io" | |||
"os" | |||
"reflect" | |||
"sync" | |||
"github.com/RoaringBitmap/roaring" | |||
"github.com/Smerity/govarint" | |||
"github.com/blevesearch/bleve/index/scorch/segment" | |||
"github.com/blevesearch/bleve/size" | |||
"github.com/couchbase/vellum" | |||
mmap "github.com/edsrzf/mmap-go" | |||
"github.com/golang/snappy" | |||
) | |||
var reflectStaticSizeSegmentBase int | |||
func init() { | |||
var sb SegmentBase | |||
reflectStaticSizeSegmentBase = int(reflect.TypeOf(sb).Size()) | |||
} | |||
// Open returns a zap impl of a segment | |||
func Open(path string) (segment.Segment, error) { | |||
f, err := os.Open(path) | |||
@@ -47,13 +55,14 @@ func Open(path string) (segment.Segment, error) { | |||
SegmentBase: SegmentBase{ | |||
mem: mm[0 : len(mm)-FooterSize], | |||
fieldsMap: make(map[string]uint16), | |||
fieldDvIterMap: make(map[uint16]*docValueIterator), | |||
fieldDvReaders: make(map[uint16]*docValueReader), | |||
}, | |||
f: f, | |||
mm: mm, | |||
path: path, | |||
refs: 1, | |||
} | |||
rv.SegmentBase.updateSize() | |||
err = rv.loadConfig() | |||
if err != nil { | |||
@@ -67,7 +76,7 @@ func Open(path string) (segment.Segment, error) { | |||
return nil, err | |||
} | |||
err = rv.loadDvIterators() | |||
err = rv.loadDvReaders() | |||
if err != nil { | |||
_ = rv.Close() | |||
return nil, err | |||
@@ -89,7 +98,39 @@ type SegmentBase struct { | |||
fieldsIndexOffset uint64 | |||
docValueOffset uint64 | |||
dictLocs []uint64 | |||
fieldDvIterMap map[uint16]*docValueIterator // naive chunk cache per field | |||
fieldDvReaders map[uint16]*docValueReader // naive chunk cache per field | |||
fieldDvNames []string // field names cached in fieldDvReaders | |||
size uint64 | |||
} | |||
// Size returns the cached in-memory footprint of the SegmentBase in
// bytes, as computed by updateSize.
func (sb *SegmentBase) Size() int {
	return int(sb.size)
}
func (sb *SegmentBase) updateSize() { | |||
sizeInBytes := reflectStaticSizeSegmentBase + | |||
cap(sb.mem) | |||
// fieldsMap | |||
for k, _ := range sb.fieldsMap { | |||
sizeInBytes += (len(k) + size.SizeOfString) + size.SizeOfUint16 | |||
} | |||
// fieldsInv, dictLocs | |||
for _, entry := range sb.fieldsInv { | |||
sizeInBytes += len(entry) + size.SizeOfString | |||
} | |||
sizeInBytes += len(sb.dictLocs) * size.SizeOfUint64 | |||
// fieldDvReaders | |||
for _, v := range sb.fieldDvReaders { | |||
sizeInBytes += size.SizeOfUint16 + size.SizeOfPtr | |||
if v != nil { | |||
sizeInBytes += v.size() | |||
} | |||
} | |||
sb.size = uint64(sizeInBytes) | |||
} | |||
func (sb *SegmentBase) AddRef() {} | |||
@@ -111,56 +152,19 @@ type Segment struct { | |||
refs int64 | |||
} | |||
func (s *Segment) SizeInBytes() uint64 { | |||
func (s *Segment) Size() int { | |||
// 8 /* size of file pointer */ | |||
// 4 /* size of version -> uint32 */ | |||
// 4 /* size of crc -> uint32 */ | |||
sizeOfUints := 16 | |||
sizeInBytes := (len(s.path) + int(segment.SizeOfString)) + sizeOfUints | |||
sizeInBytes := (len(s.path) + size.SizeOfString) + sizeOfUints | |||
// mutex, refs -> int64 | |||
sizeInBytes += 16 | |||
// do not include the mmap'ed part | |||
return uint64(sizeInBytes) + s.SegmentBase.SizeInBytes() - uint64(len(s.mem)) | |||
} | |||
func (s *SegmentBase) SizeInBytes() uint64 { | |||
// 4 /* size of memCRC -> uint32 */ | |||
// 4 /* size of chunkFactor -> uint32 */ | |||
// 8 /* size of numDocs -> uint64 */ | |||
// 8 /* size of storedIndexOffset -> uint64 */ | |||
// 8 /* size of fieldsIndexOffset -> uint64 */ | |||
// 8 /* size of docValueOffset -> uint64 */ | |||
sizeInBytes := 40 | |||
sizeInBytes += len(s.mem) + int(segment.SizeOfSlice) | |||
// fieldsMap | |||
for k, _ := range s.fieldsMap { | |||
sizeInBytes += (len(k) + int(segment.SizeOfString)) + 2 /* size of uint16 */ | |||
} | |||
sizeInBytes += int(segment.SizeOfMap) /* overhead from map */ | |||
// fieldsInv, dictLocs | |||
for _, entry := range s.fieldsInv { | |||
sizeInBytes += (len(entry) + int(segment.SizeOfString)) | |||
} | |||
sizeInBytes += len(s.dictLocs) * 8 /* size of uint64 */ | |||
sizeInBytes += int(segment.SizeOfSlice) * 3 /* overhead from slices */ | |||
// fieldDvIterMap | |||
sizeInBytes += len(s.fieldDvIterMap) * | |||
int(segment.SizeOfPointer+2 /* size of uint16 */) | |||
for _, entry := range s.fieldDvIterMap { | |||
if entry != nil { | |||
sizeInBytes += int(entry.sizeInBytes()) | |||
} | |||
} | |||
sizeInBytes += int(segment.SizeOfMap) | |||
return uint64(sizeInBytes) | |||
return sizeInBytes + s.SegmentBase.Size() - cap(s.mem) | |||
} | |||
func (s *Segment) AddRef() { | |||
@@ -185,7 +189,7 @@ func (s *Segment) loadConfig() error { | |||
verOffset := crcOffset - 4 | |||
s.version = binary.BigEndian.Uint32(s.mm[verOffset : verOffset+4]) | |||
if s.version != version { | |||
if s.version != Version { | |||
return fmt.Errorf("unsupported version %d", s.version) | |||
} | |||
@@ -207,7 +211,7 @@ func (s *Segment) loadConfig() error { | |||
} | |||
func (s *SegmentBase) loadFields() error { | |||
// NOTE for now we assume the fields index immediately preceeds | |||
// NOTE for now we assume the fields index immediately precedes | |||
// the footer, and if this changes, need to adjust accordingly (or | |||
// store explicit length), where s.mem was sliced from s.mm in Open(). | |||
fieldsIndexEnd := uint64(len(s.mem)) | |||
@@ -262,6 +266,10 @@ func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) { | |||
if err != nil { | |||
return nil, fmt.Errorf("dictionary field %s vellum err: %v", field, err) | |||
} | |||
rv.fstReader, err = rv.fst.Reader() | |||
if err != nil { | |||
return nil, fmt.Errorf("dictionary field %s vellum reader err: %v", field, err) | |||
} | |||
} | |||
} | |||
} | |||
@@ -269,50 +277,90 @@ func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) { | |||
return rv, nil | |||
} | |||
// visitDocumentCtx holds data structures that are reusable across | |||
// multiple VisitDocument() calls to avoid memory allocations | |||
type visitDocumentCtx struct { | |||
buf []byte | |||
reader bytes.Reader | |||
arrayPos []uint64 | |||
} | |||
var visitDocumentCtxPool = sync.Pool{ | |||
New: func() interface{} { | |||
reuse := &visitDocumentCtx{} | |||
return reuse | |||
}, | |||
} | |||
// VisitDocument invokes the DocFieldValueVistor for each stored field | |||
// for the specified doc number | |||
func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error { | |||
vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) | |||
defer visitDocumentCtxPool.Put(vdc) | |||
return s.visitDocument(vdc, num, visitor) | |||
} | |||
func (s *SegmentBase) visitDocument(vdc *visitDocumentCtx, num uint64, | |||
visitor segment.DocumentFieldValueVisitor) error { | |||
// first make sure this is a valid number in this segment | |||
if num < s.numDocs { | |||
meta, compressed := s.getDocStoredMetaAndCompressed(num) | |||
uncompressed, err := snappy.Decode(nil, compressed) | |||
vdc.reader.Reset(meta) | |||
// handle _id field special case | |||
idFieldValLen, err := binary.ReadUvarint(&vdc.reader) | |||
if err != nil { | |||
return err | |||
} | |||
idFieldVal := compressed[:idFieldValLen] | |||
keepGoing := visitor("_id", byte('t'), idFieldVal, nil) | |||
if !keepGoing { | |||
visitDocumentCtxPool.Put(vdc) | |||
return nil | |||
} | |||
// handle non-"_id" fields | |||
compressed = compressed[idFieldValLen:] | |||
uncompressed, err := snappy.Decode(vdc.buf[:cap(vdc.buf)], compressed) | |||
if err != nil { | |||
return err | |||
} | |||
// now decode meta and process | |||
reader := bytes.NewReader(meta) | |||
decoder := govarint.NewU64Base128Decoder(reader) | |||
keepGoing := true | |||
for keepGoing { | |||
field, err := decoder.GetU64() | |||
field, err := binary.ReadUvarint(&vdc.reader) | |||
if err == io.EOF { | |||
break | |||
} | |||
if err != nil { | |||
return err | |||
} | |||
typ, err := decoder.GetU64() | |||
typ, err := binary.ReadUvarint(&vdc.reader) | |||
if err != nil { | |||
return err | |||
} | |||
offset, err := decoder.GetU64() | |||
offset, err := binary.ReadUvarint(&vdc.reader) | |||
if err != nil { | |||
return err | |||
} | |||
l, err := decoder.GetU64() | |||
l, err := binary.ReadUvarint(&vdc.reader) | |||
if err != nil { | |||
return err | |||
} | |||
numap, err := decoder.GetU64() | |||
numap, err := binary.ReadUvarint(&vdc.reader) | |||
if err != nil { | |||
return err | |||
} | |||
var arrayPos []uint64 | |||
if numap > 0 { | |||
arrayPos = make([]uint64, numap) | |||
if cap(vdc.arrayPos) < int(numap) { | |||
vdc.arrayPos = make([]uint64, numap) | |||
} | |||
arrayPos = vdc.arrayPos[:numap] | |||
for i := 0; i < int(numap); i++ { | |||
ap, err := decoder.GetU64() | |||
ap, err := binary.ReadUvarint(&vdc.reader) | |||
if err != nil { | |||
return err | |||
} | |||
@@ -323,10 +371,36 @@ func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldVal | |||
value := uncompressed[offset : offset+l] | |||
keepGoing = visitor(s.fieldsInv[field], byte(typ), value, arrayPos) | |||
} | |||
vdc.buf = uncompressed | |||
} | |||
return nil | |||
} | |||
// DocID returns the value of the _id field for the given docNum | |||
func (s *SegmentBase) DocID(num uint64) ([]byte, error) { | |||
if num >= s.numDocs { | |||
return nil, nil | |||
} | |||
vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) | |||
meta, compressed := s.getDocStoredMetaAndCompressed(num) | |||
vdc.reader.Reset(meta) | |||
// handle _id field special case | |||
idFieldValLen, err := binary.ReadUvarint(&vdc.reader) | |||
if err != nil { | |||
return nil, err | |||
} | |||
idFieldVal := compressed[:idFieldValLen] | |||
visitDocumentCtxPool.Put(vdc) | |||
return idFieldVal, nil | |||
} | |||
// Count returns the number of documents in this segment. | |||
func (s *SegmentBase) Count() uint64 { | |||
return s.numDocs | |||
@@ -343,15 +417,26 @@ func (s *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) { | |||
return nil, err | |||
} | |||
var postings *PostingsList | |||
postingsList := emptyPostingsList | |||
sMax, err := idDict.fst.GetMaxKey() | |||
if err != nil { | |||
return nil, err | |||
} | |||
sMaxStr := string(sMax) | |||
filteredIds := make([]string, 0, len(ids)) | |||
for _, id := range ids { | |||
postings, err = idDict.postingsList([]byte(id), nil, postings) | |||
if id <= sMaxStr { | |||
filteredIds = append(filteredIds, id) | |||
} | |||
} | |||
for _, id := range filteredIds { | |||
postingsList, err = idDict.postingsList([]byte(id), nil, postingsList) | |||
if err != nil { | |||
return nil, err | |||
} | |||
if postings.postings != nil { | |||
rv.Or(postings.postings) | |||
} | |||
postingsList.OrInto(rv) | |||
} | |||
} | |||
@@ -441,19 +526,32 @@ func (s *Segment) DictAddr(field string) (uint64, error) { | |||
return s.dictLocs[fieldIDPlus1-1], nil | |||
} | |||
func (s *SegmentBase) loadDvIterators() error { | |||
func (s *SegmentBase) loadDvReaders() error { | |||
if s.docValueOffset == fieldNotUninverted { | |||
return nil | |||
} | |||
var read uint64 | |||
for fieldID, field := range s.fieldsInv { | |||
fieldLoc, n := binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64]) | |||
var fieldLocStart, fieldLocEnd uint64 | |||
var n int | |||
fieldLocStart, n = binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64]) | |||
if n <= 0 { | |||
return fmt.Errorf("loadDvIterators: failed to read the docvalue offsets for field %d", fieldID) | |||
return fmt.Errorf("loadDvReaders: failed to read the docvalue offset start for field %d", fieldID) | |||
} | |||
s.fieldDvIterMap[uint16(fieldID)], _ = s.loadFieldDocValueIterator(field, fieldLoc) | |||
read += uint64(n) | |||
fieldLocEnd, n = binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64]) | |||
if n <= 0 { | |||
return fmt.Errorf("loadDvReaders: failed to read the docvalue offset end for field %d", fieldID) | |||
} | |||
read += uint64(n) | |||
fieldDvReader, _ := s.loadFieldDocValueReader(field, fieldLocStart, fieldLocEnd) | |||
if fieldDvReader != nil { | |||
s.fieldDvReaders[uint16(fieldID)] = fieldDvReader | |||
s.fieldDvNames = append(s.fieldDvNames, field) | |||
} | |||
} | |||
return nil | |||
} |
@@ -15,7 +15,6 @@ | |||
package zap | |||
import ( | |||
"bytes" | |||
"encoding/binary" | |||
"io" | |||
@@ -25,28 +24,29 @@ import ( | |||
// writes out the length of the roaring bitmap in bytes as varint | |||
// then writes out the roaring bitmap itself | |||
func writeRoaringWithLen(r *roaring.Bitmap, w io.Writer, | |||
reuseBuf *bytes.Buffer, reuseBufVarint []byte) (int, error) { | |||
reuseBuf.Reset() | |||
// write out postings list to memory so we know the len | |||
postingsListLen, err := r.WriteTo(reuseBuf) | |||
reuseBufVarint []byte) (int, error) { | |||
buf, err := r.ToBytes() | |||
if err != nil { | |||
return 0, err | |||
} | |||
var tw int | |||
// write out the length of this postings list | |||
n := binary.PutUvarint(reuseBufVarint, uint64(postingsListLen)) | |||
// write out the length | |||
n := binary.PutUvarint(reuseBufVarint, uint64(len(buf))) | |||
nw, err := w.Write(reuseBufVarint[:n]) | |||
tw += nw | |||
if err != nil { | |||
return tw, err | |||
} | |||
// write out the postings list itself | |||
nw, err = w.Write(reuseBuf.Bytes()) | |||
// write out the roaring bytes | |||
nw, err = w.Write(buf) | |||
tw += nw | |||
if err != nil { | |||
return tw, err | |||
} | |||
return tw, nil | |||
} | |||
@@ -118,7 +118,7 @@ func persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset | |||
return err | |||
} | |||
// write out 32-bit version | |||
err = binary.Write(w, binary.BigEndian, version) | |||
err = binary.Write(w, binary.BigEndian, Version) | |||
if err != nil { | |||
return err | |||
} |
@@ -15,10 +15,10 @@ | |||
package scorch | |||
import ( | |||
"bytes" | |||
"container/heap" | |||
"encoding/binary" | |||
"fmt" | |||
"reflect" | |||
"sort" | |||
"sync" | |||
"sync/atomic" | |||
@@ -27,8 +27,13 @@ import ( | |||
"github.com/blevesearch/bleve/document" | |||
"github.com/blevesearch/bleve/index" | |||
"github.com/blevesearch/bleve/index/scorch/segment" | |||
"github.com/couchbase/vellum" | |||
lev2 "github.com/couchbase/vellum/levenshtein2" | |||
) | |||
// re usable, threadsafe levenshtein builders | |||
var lb1, lb2 *lev2.LevenshteinAutomatonBuilder | |||
type asynchSegmentResult struct { | |||
dictItr segment.DictionaryIterator | |||
@@ -40,15 +45,36 @@ type asynchSegmentResult struct { | |||
err error | |||
} | |||
var reflectStaticSizeIndexSnapshot int | |||
func init() { | |||
var is interface{} = IndexSnapshot{} | |||
reflectStaticSizeIndexSnapshot = int(reflect.TypeOf(is).Size()) | |||
var err error | |||
lb1, err = lev2.NewLevenshteinAutomatonBuilder(1, true) | |||
if err != nil { | |||
panic(fmt.Errorf("Levenshtein automaton ed1 builder err: %v", err)) | |||
} | |||
lb2, err = lev2.NewLevenshteinAutomatonBuilder(2, true) | |||
if err != nil { | |||
panic(fmt.Errorf("Levenshtein automaton ed2 builder err: %v", err)) | |||
} | |||
} | |||
type IndexSnapshot struct { | |||
parent *Scorch | |||
segment []*SegmentSnapshot | |||
offsets []uint64 | |||
internal map[string][]byte | |||
epoch uint64 | |||
size uint64 | |||
creator string | |||
m sync.Mutex // Protects the fields that follow. | |||
refs int64 | |||
m2 sync.Mutex // Protects the fields that follow. | |||
fieldTFRs map[string][]*IndexSnapshotTermFieldReader // keyed by field, recycled TFR's | |||
} | |||
func (i *IndexSnapshot) Segments() []*SegmentSnapshot { | |||
@@ -85,12 +111,27 @@ func (i *IndexSnapshot) DecRef() (err error) { | |||
return err | |||
} | |||
func (i *IndexSnapshot) Close() error { | |||
return i.DecRef() | |||
} | |||
func (i *IndexSnapshot) Size() int { | |||
return int(i.size) | |||
} | |||
func (i *IndexSnapshot) updateSize() { | |||
i.size += uint64(reflectStaticSizeIndexSnapshot) | |||
for _, s := range i.segment { | |||
i.size += uint64(s.Size()) | |||
} | |||
} | |||
func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i segment.TermDictionary) segment.DictionaryIterator) (*IndexSnapshotFieldDict, error) { | |||
results := make(chan *asynchSegmentResult) | |||
for index, segment := range i.segment { | |||
go func(index int, segment *SegmentSnapshot) { | |||
dict, err := segment.Dictionary(field) | |||
dict, err := segment.segment.Dictionary(field) | |||
if err != nil { | |||
results <- &asynchSegmentResult{err: err} | |||
} else { | |||
@@ -116,7 +157,7 @@ func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i s | |||
if next != nil { | |||
rv.cursors = append(rv.cursors, &segmentDictCursor{ | |||
itr: asr.dictItr, | |||
curr: next, | |||
curr: *next, | |||
}) | |||
} | |||
} | |||
@@ -151,6 +192,56 @@ func (i *IndexSnapshot) FieldDictPrefix(field string, | |||
}) | |||
} | |||
func (i *IndexSnapshot) FieldDictRegexp(field string, | |||
termRegex string) (index.FieldDict, error) { | |||
// TODO: potential optimization where the literal prefix represents the, | |||
// entire regexp, allowing us to use PrefixIterator(prefixTerm)? | |||
a, prefixBeg, prefixEnd, err := segment.ParseRegexp(termRegex) | |||
if err != nil { | |||
return nil, err | |||
} | |||
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { | |||
return i.AutomatonIterator(a, prefixBeg, prefixEnd) | |||
}) | |||
} | |||
func (i *IndexSnapshot) getLevAutomaton(term string, | |||
fuzziness uint8) (vellum.Automaton, error) { | |||
if fuzziness == 1 { | |||
return lb1.BuildDfa(term, fuzziness) | |||
} else if fuzziness == 2 { | |||
return lb2.BuildDfa(term, fuzziness) | |||
} | |||
return nil, fmt.Errorf("fuzziness exceeds the max limit") | |||
} | |||
func (i *IndexSnapshot) FieldDictFuzzy(field string, | |||
term string, fuzziness int, prefix string) (index.FieldDict, error) { | |||
a, err := i.getLevAutomaton(term, uint8(fuzziness)) | |||
if err != nil { | |||
return nil, err | |||
} | |||
var prefixBeg, prefixEnd []byte | |||
if prefix != "" { | |||
prefixBeg = []byte(prefix) | |||
prefixEnd = segment.IncrementBytes(prefixBeg) | |||
} | |||
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { | |||
return i.AutomatonIterator(a, prefixBeg, prefixEnd) | |||
}) | |||
} | |||
func (i *IndexSnapshot) FieldDictOnly(field string, | |||
onlyTerms [][]byte, includeCount bool) (index.FieldDict, error) { | |||
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { | |||
return i.OnlyIterator(onlyTerms, includeCount) | |||
}) | |||
} | |||
func (i *IndexSnapshot) DocIDReaderAll() (index.DocIDReader, error) { | |||
results := make(chan *asynchSegmentResult) | |||
for index, segment := range i.segment { | |||
@@ -264,21 +355,26 @@ func (i *IndexSnapshot) Document(id string) (rv *document.Document, err error) { | |||
segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum) | |||
rv = document.NewDocument(id) | |||
err = i.segment[segmentIndex].VisitDocument(localDocNum, func(name string, typ byte, value []byte, pos []uint64) bool { | |||
err = i.segment[segmentIndex].VisitDocument(localDocNum, func(name string, typ byte, val []byte, pos []uint64) bool { | |||
if name == "_id" { | |||
return true | |||
} | |||
// copy value, array positions to preserve them beyond the scope of this callback | |||
value := append([]byte(nil), val...) | |||
arrayPos := append([]uint64(nil), pos...) | |||
switch typ { | |||
case 't': | |||
rv.AddField(document.NewTextField(name, pos, value)) | |||
rv.AddField(document.NewTextField(name, arrayPos, value)) | |||
case 'n': | |||
rv.AddField(document.NewNumericFieldFromBytes(name, pos, value)) | |||
rv.AddField(document.NewNumericFieldFromBytes(name, arrayPos, value)) | |||
case 'd': | |||
rv.AddField(document.NewDateTimeFieldFromBytes(name, pos, value)) | |||
rv.AddField(document.NewDateTimeFieldFromBytes(name, arrayPos, value)) | |||
case 'b': | |||
rv.AddField(document.NewBooleanFieldFromBytes(name, pos, value)) | |||
rv.AddField(document.NewBooleanFieldFromBytes(name, arrayPos, value)) | |||
case 'g': | |||
rv.AddField(document.NewGeoPointFieldFromBytes(name, pos, value)) | |||
rv.AddField(document.NewGeoPointFieldFromBytes(name, arrayPos, value)) | |||
} | |||
return true | |||
@@ -307,24 +403,15 @@ func (i *IndexSnapshot) ExternalID(id index.IndexInternalID) (string, error) { | |||
} | |||
segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum) | |||
var found bool | |||
var rv string | |||
err = i.segment[segmentIndex].VisitDocument(localDocNum, func(field string, typ byte, value []byte, pos []uint64) bool { | |||
if field == "_id" { | |||
found = true | |||
rv = string(value) | |||
return false | |||
} | |||
return true | |||
}) | |||
v, err := i.segment[segmentIndex].DocID(localDocNum) | |||
if err != nil { | |||
return "", err | |||
} | |||
if found { | |||
return rv, nil | |||
if v == nil { | |||
return "", fmt.Errorf("document number %d not found", docNum) | |||
} | |||
return "", fmt.Errorf("document number %d not found", docNum) | |||
return string(v), nil | |||
} | |||
func (i *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err error) { | |||
@@ -349,33 +436,81 @@ func (i *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err err | |||
func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq, | |||
includeNorm, includeTermVectors bool) (index.TermFieldReader, error) { | |||
rv := &IndexSnapshotTermFieldReader{ | |||
term: term, | |||
field: field, | |||
snapshot: i, | |||
postings: make([]segment.PostingsList, len(i.segment)), | |||
iterators: make([]segment.PostingsIterator, len(i.segment)), | |||
includeFreq: includeFreq, | |||
includeNorm: includeNorm, | |||
includeTermVectors: includeTermVectors, | |||
rv := i.allocTermFieldReaderDicts(field) | |||
rv.term = term | |||
rv.field = field | |||
rv.snapshot = i | |||
if rv.postings == nil { | |||
rv.postings = make([]segment.PostingsList, len(i.segment)) | |||
} | |||
if rv.iterators == nil { | |||
rv.iterators = make([]segment.PostingsIterator, len(i.segment)) | |||
} | |||
rv.segmentOffset = 0 | |||
rv.includeFreq = includeFreq | |||
rv.includeNorm = includeNorm | |||
rv.includeTermVectors = includeTermVectors | |||
rv.currPosting = nil | |||
rv.currID = rv.currID[:0] | |||
if rv.dicts == nil { | |||
rv.dicts = make([]segment.TermDictionary, len(i.segment)) | |||
for i, segment := range i.segment { | |||
dict, err := segment.segment.Dictionary(field) | |||
if err != nil { | |||
return nil, err | |||
} | |||
rv.dicts[i] = dict | |||
} | |||
} | |||
for i, segment := range i.segment { | |||
dict, err := segment.Dictionary(field) | |||
if err != nil { | |||
return nil, err | |||
} | |||
pl, err := dict.PostingsList(string(term), nil) | |||
pl, err := rv.dicts[i].PostingsList(term, segment.deleted, rv.postings[i]) | |||
if err != nil { | |||
return nil, err | |||
} | |||
rv.postings[i] = pl | |||
rv.iterators[i] = pl.Iterator() | |||
rv.iterators[i] = pl.Iterator(includeFreq, includeNorm, includeTermVectors, rv.iterators[i]) | |||
} | |||
atomic.AddUint64(&i.parent.stats.termSearchersStarted, uint64(1)) | |||
atomic.AddUint64(&i.parent.stats.TotTermSearchersStarted, uint64(1)) | |||
return rv, nil | |||
} | |||
func (i *IndexSnapshot) allocTermFieldReaderDicts(field string) (tfr *IndexSnapshotTermFieldReader) { | |||
i.m2.Lock() | |||
if i.fieldTFRs != nil { | |||
tfrs := i.fieldTFRs[field] | |||
last := len(tfrs) - 1 | |||
if last >= 0 { | |||
tfr = tfrs[last] | |||
tfrs[last] = nil | |||
i.fieldTFRs[field] = tfrs[:last] | |||
i.m2.Unlock() | |||
return | |||
} | |||
} | |||
i.m2.Unlock() | |||
return &IndexSnapshotTermFieldReader{} | |||
} | |||
func (i *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReader) { | |||
i.parent.rootLock.RLock() | |||
obsolete := i.parent.root != i | |||
i.parent.rootLock.RUnlock() | |||
if obsolete { | |||
// if we're not the current root (mutations happened), don't bother recycling | |||
return | |||
} | |||
i.m2.Lock() | |||
if i.fieldTFRs == nil { | |||
i.fieldTFRs = map[string][]*IndexSnapshotTermFieldReader{} | |||
} | |||
i.fieldTFRs[tfr.field] = append(i.fieldTFRs[tfr.field], tfr) | |||
i.m2.Unlock() | |||
} | |||
func docNumberToBytes(buf []byte, in uint64) []byte { | |||
if len(buf) != 8 { | |||
if cap(buf) >= 8 { | |||
@@ -389,115 +524,172 @@ func docNumberToBytes(buf []byte, in uint64) []byte { | |||
} | |||
func docInternalToNumber(in index.IndexInternalID) (uint64, error) { | |||
var res uint64 | |||
err := binary.Read(bytes.NewReader(in), binary.BigEndian, &res) | |||
if err != nil { | |||
return 0, err | |||
if len(in) != 8 { | |||
return 0, fmt.Errorf("wrong len for IndexInternalID: %q", in) | |||
} | |||
return res, nil | |||
return binary.BigEndian.Uint64(in), nil | |||
} | |||
func (i *IndexSnapshot) DocumentVisitFieldTerms(id index.IndexInternalID, | |||
fields []string, visitor index.DocumentFieldTermVisitor) error { | |||
_, err := i.documentVisitFieldTerms(id, fields, visitor, nil) | |||
return err | |||
} | |||
func (i *IndexSnapshot) documentVisitFieldTerms(id index.IndexInternalID, | |||
fields []string, visitor index.DocumentFieldTermVisitor, | |||
dvs segment.DocVisitState) (segment.DocVisitState, error) { | |||
docNum, err := docInternalToNumber(id) | |||
if err != nil { | |||
return err | |||
return nil, err | |||
} | |||
segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum) | |||
if segmentIndex >= len(i.segment) { | |||
return nil | |||
return nil, nil | |||
} | |||
_, dvs, err = i.documentVisitFieldTermsOnSegment( | |||
segmentIndex, localDocNum, fields, nil, visitor, dvs) | |||
return dvs, err | |||
} | |||
func (i *IndexSnapshot) documentVisitFieldTermsOnSegment( | |||
segmentIndex int, localDocNum uint64, fields []string, cFields []string, | |||
visitor index.DocumentFieldTermVisitor, dvs segment.DocVisitState) ( | |||
cFieldsOut []string, dvsOut segment.DocVisitState, err error) { | |||
ss := i.segment[segmentIndex] | |||
if zaps, ok := ss.segment.(segment.DocumentFieldTermVisitable); ok { | |||
// get the list of doc value persisted fields | |||
pFields, err := zaps.VisitableDocValueFields() | |||
var vFields []string // fields that are visitable via the segment | |||
ssv, ssvOk := ss.segment.(segment.DocumentFieldTermVisitable) | |||
if ssvOk && ssv != nil { | |||
vFields, err = ssv.VisitableDocValueFields() | |||
if err != nil { | |||
return err | |||
} | |||
// assort the fields for which terms look up have to | |||
// be performed runtime | |||
dvPendingFields := extractDvPendingFields(fields, pFields) | |||
if len(dvPendingFields) == 0 { | |||
// all fields are doc value persisted | |||
return zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor) | |||
return nil, nil, err | |||
} | |||
} | |||
// concurrently trigger the runtime doc value preparations for | |||
// pending fields as well as the visit of the persisted doc values | |||
errCh := make(chan error, 1) | |||
var errCh chan error | |||
go func() { | |||
defer close(errCh) | |||
err := ss.cachedDocs.prepareFields(fields, ss) | |||
if err != nil { | |||
errCh <- err | |||
} | |||
}() | |||
// cFields represents the fields that we'll need from the | |||
// cachedDocs, and might be optionally be provided by the caller, | |||
// if the caller happens to know we're on the same segmentIndex | |||
// from a previous invocation | |||
if cFields == nil { | |||
cFields = subtractStrings(fields, vFields) | |||
if !ss.cachedDocs.hasFields(cFields) { | |||
errCh = make(chan error, 1) | |||
go func() { | |||
err := ss.cachedDocs.prepareFields(cFields, ss) | |||
if err != nil { | |||
errCh <- err | |||
} | |||
close(errCh) | |||
}() | |||
} | |||
} | |||
// visit the persisted dv while the cache preparation is in progress | |||
err = zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor) | |||
if ssvOk && ssv != nil && len(vFields) > 0 { | |||
dvs, err = ssv.VisitDocumentFieldTerms(localDocNum, fields, visitor, dvs) | |||
if err != nil { | |||
return err | |||
return nil, nil, err | |||
} | |||
} | |||
// err out if fieldCache preparation failed | |||
if errCh != nil { | |||
err = <-errCh | |||
if err != nil { | |||
return err | |||
return nil, nil, err | |||
} | |||
} | |||
visitDocumentFieldCacheTerms(localDocNum, dvPendingFields, ss, visitor) | |||
return nil | |||
if len(cFields) > 0 { | |||
ss.cachedDocs.visitDoc(localDocNum, cFields, visitor) | |||
} | |||
return prepareCacheVisitDocumentFieldTerms(localDocNum, fields, ss, visitor) | |||
return cFields, dvs, nil | |||
} | |||
func (i *IndexSnapshot) DocValueReader(fields []string) ( | |||
index.DocValueReader, error) { | |||
return &DocValueReader{i: i, fields: fields, currSegmentIndex: -1}, nil | |||
} | |||
type DocValueReader struct { | |||
i *IndexSnapshot | |||
fields []string | |||
dvs segment.DocVisitState | |||
currSegmentIndex int | |||
currCachedFields []string | |||
} | |||
func prepareCacheVisitDocumentFieldTerms(localDocNum uint64, fields []string, | |||
ss *SegmentSnapshot, visitor index.DocumentFieldTermVisitor) error { | |||
err := ss.cachedDocs.prepareFields(fields, ss) | |||
func (dvr *DocValueReader) VisitDocValues(id index.IndexInternalID, | |||
visitor index.DocumentFieldTermVisitor) (err error) { | |||
docNum, err := docInternalToNumber(id) | |||
if err != nil { | |||
return err | |||
} | |||
visitDocumentFieldCacheTerms(localDocNum, fields, ss, visitor) | |||
return nil | |||
segmentIndex, localDocNum := dvr.i.segmentIndexAndLocalDocNumFromGlobal(docNum) | |||
if segmentIndex >= len(dvr.i.segment) { | |||
return nil | |||
} | |||
if dvr.currSegmentIndex != segmentIndex { | |||
dvr.currSegmentIndex = segmentIndex | |||
dvr.currCachedFields = nil | |||
} | |||
dvr.currCachedFields, dvr.dvs, err = dvr.i.documentVisitFieldTermsOnSegment( | |||
dvr.currSegmentIndex, localDocNum, dvr.fields, dvr.currCachedFields, visitor, dvr.dvs) | |||
return err | |||
} | |||
func visitDocumentFieldCacheTerms(localDocNum uint64, fields []string, | |||
ss *SegmentSnapshot, visitor index.DocumentFieldTermVisitor) { | |||
func (i *IndexSnapshot) DumpAll() chan interface{} { | |||
rv := make(chan interface{}) | |||
go func() { | |||
close(rv) | |||
}() | |||
return rv | |||
} | |||
for _, field := range fields { | |||
if cachedFieldDocs, exists := ss.cachedDocs.cache[field]; exists { | |||
if tlist, exists := cachedFieldDocs.docs[localDocNum]; exists { | |||
for { | |||
i := bytes.Index(tlist, TermSeparatorSplitSlice) | |||
if i < 0 { | |||
break | |||
} | |||
visitor(field, tlist[0:i]) | |||
tlist = tlist[i+1:] | |||
} | |||
} | |||
} | |||
} | |||
func (i *IndexSnapshot) DumpDoc(id string) chan interface{} { | |||
rv := make(chan interface{}) | |||
go func() { | |||
close(rv) | |||
}() | |||
return rv | |||
} | |||
func (i *IndexSnapshot) DumpFields() chan interface{} { | |||
rv := make(chan interface{}) | |||
go func() { | |||
close(rv) | |||
}() | |||
return rv | |||
} | |||
func extractDvPendingFields(requestedFields, persistedFields []string) []string { | |||
removeMap := map[string]struct{}{} | |||
for _, str := range persistedFields { | |||
removeMap[str] = struct{}{} | |||
// subtractStrings returns set a minus elements of set b. | |||
func subtractStrings(a, b []string) []string { | |||
if len(b) == 0 { | |||
return a | |||
} | |||
rv := make([]string, 0, len(requestedFields)) | |||
for _, s := range requestedFields { | |||
if _, ok := removeMap[s]; !ok { | |||
rv = append(rv, s) | |||
rv := make([]string, 0, len(a)) | |||
OUTER: | |||
for _, as := range a { | |||
for _, bs := range b { | |||
if as == bs { | |||
continue OUTER | |||
} | |||
} | |||
rv = append(rv, as) | |||
} | |||
return rv | |||
} |
@@ -23,12 +23,13 @@ import ( | |||
type segmentDictCursor struct { | |||
itr segment.DictionaryIterator | |||
curr *index.DictEntry | |||
curr index.DictEntry | |||
} | |||
type IndexSnapshotFieldDict struct { | |||
snapshot *IndexSnapshot | |||
cursors []*segmentDictCursor | |||
entry index.DictEntry | |||
} | |||
func (i *IndexSnapshotFieldDict) Len() int { return len(i.cursors) } | |||
@@ -51,10 +52,10 @@ func (i *IndexSnapshotFieldDict) Pop() interface{} { | |||
} | |||
func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) { | |||
if len(i.cursors) <= 0 { | |||
if len(i.cursors) == 0 { | |||
return nil, nil | |||
} | |||
rv := i.cursors[0].curr | |||
i.entry = i.cursors[0].curr | |||
next, err := i.cursors[0].itr.Next() | |||
if err != nil { | |||
return nil, err | |||
@@ -64,12 +65,12 @@ func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) { | |||
heap.Pop(i) | |||
} else { | |||
// modified heap, fix it | |||
i.cursors[0].curr = next | |||
i.cursors[0].curr = *next | |||
heap.Fix(i, 0) | |||
} | |||
// look for any other entries with the exact same term | |||
for len(i.cursors) > 0 && i.cursors[0].curr.Term == rv.Term { | |||
rv.Count += i.cursors[0].curr.Count | |||
for len(i.cursors) > 0 && i.cursors[0].curr.Term == i.entry.Term { | |||
i.entry.Count += i.cursors[0].curr.Count | |||
next, err := i.cursors[0].itr.Next() | |||
if err != nil { | |||
return nil, err | |||
@@ -79,12 +80,12 @@ func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) { | |||
heap.Pop(i) | |||
} else { | |||
// modified heap, fix it | |||
i.cursors[0].curr = next | |||
i.cursors[0].curr = *next | |||
heap.Fix(i, 0) | |||
} | |||
} | |||
return rv, nil | |||
return &i.entry, nil | |||
} | |||
func (i *IndexSnapshotFieldDict) Close() error { |
@@ -16,17 +16,30 @@ package scorch | |||
import ( | |||
"bytes" | |||
"reflect" | |||
"github.com/RoaringBitmap/roaring" | |||
"github.com/blevesearch/bleve/index" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var reflectStaticSizeIndexSnapshotDocIDReader int | |||
func init() { | |||
var isdr IndexSnapshotDocIDReader | |||
reflectStaticSizeIndexSnapshotDocIDReader = int(reflect.TypeOf(isdr).Size()) | |||
} | |||
type IndexSnapshotDocIDReader struct { | |||
snapshot *IndexSnapshot | |||
iterators []roaring.IntIterable | |||
segmentOffset int | |||
} | |||
func (i *IndexSnapshotDocIDReader) Size() int { | |||
return reflectStaticSizeIndexSnapshotDocIDReader + size.SizeOfPtr | |||
} | |||
func (i *IndexSnapshotDocIDReader) Next() (index.IndexInternalID, error) { | |||
for i.segmentOffset < len(i.iterators) { | |||
if !i.iterators[i.segmentOffset].HasNext() { |
@@ -16,16 +16,27 @@ package scorch | |||
import ( | |||
"bytes" | |||
"fmt" | |||
"reflect" | |||
"sync/atomic" | |||
"github.com/blevesearch/bleve/index" | |||
"github.com/blevesearch/bleve/index/scorch/segment" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var reflectStaticSizeIndexSnapshotTermFieldReader int | |||
func init() { | |||
var istfr IndexSnapshotTermFieldReader | |||
reflectStaticSizeIndexSnapshotTermFieldReader = int(reflect.TypeOf(istfr).Size()) | |||
} | |||
type IndexSnapshotTermFieldReader struct { | |||
term []byte | |||
field string | |||
snapshot *IndexSnapshot | |||
dicts []segment.TermDictionary | |||
postings []segment.PostingsList | |||
iterators []segment.PostingsIterator | |||
segmentOffset int | |||
@@ -36,13 +47,34 @@ type IndexSnapshotTermFieldReader struct { | |||
currID index.IndexInternalID | |||
} | |||
func (i *IndexSnapshotTermFieldReader) Size() int { | |||
sizeInBytes := reflectStaticSizeIndexSnapshotTermFieldReader + size.SizeOfPtr + | |||
len(i.term) + | |||
len(i.field) + | |||
len(i.currID) | |||
for _, entry := range i.postings { | |||
sizeInBytes += entry.Size() | |||
} | |||
for _, entry := range i.iterators { | |||
sizeInBytes += entry.Size() | |||
} | |||
if i.currPosting != nil { | |||
sizeInBytes += i.currPosting.Size() | |||
} | |||
return sizeInBytes | |||
} | |||
func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*index.TermFieldDoc, error) { | |||
rv := preAlloced | |||
if rv == nil { | |||
rv = &index.TermFieldDoc{} | |||
} | |||
// find the next hit | |||
for i.segmentOffset < len(i.postings) { | |||
for i.segmentOffset < len(i.iterators) { | |||
next, err := i.iterators[i.segmentOffset].Next() | |||
if err != nil { | |||
return nil, err | |||
@@ -72,9 +104,16 @@ func (i *IndexSnapshotTermFieldReader) postingToTermFieldDoc(next segment.Postin | |||
} | |||
if i.includeTermVectors { | |||
locs := next.Locations() | |||
rv.Vectors = make([]*index.TermFieldVector, len(locs)) | |||
if cap(rv.Vectors) < len(locs) { | |||
rv.Vectors = make([]*index.TermFieldVector, len(locs)) | |||
backing := make([]index.TermFieldVector, len(locs)) | |||
for i := range backing { | |||
rv.Vectors[i] = &backing[i] | |||
} | |||
} | |||
rv.Vectors = rv.Vectors[:len(locs)] | |||
for i, loc := range locs { | |||
rv.Vectors[i] = &index.TermFieldVector{ | |||
*rv.Vectors[i] = index.TermFieldVector{ | |||
Start: loc.Start(), | |||
End: loc.End(), | |||
Pos: loc.Pos(), | |||
@@ -96,24 +135,37 @@ func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAllo | |||
} | |||
*i = *(i2.(*IndexSnapshotTermFieldReader)) | |||
} | |||
// FIXME do something better | |||
next, err := i.Next(preAlloced) | |||
num, err := docInternalToNumber(ID) | |||
if err != nil { | |||
return nil, fmt.Errorf("error converting to doc number % x - %v", ID, err) | |||
} | |||
segIndex, ldocNum := i.snapshot.segmentIndexAndLocalDocNumFromGlobal(num) | |||
if segIndex >= len(i.snapshot.segment) { | |||
return nil, fmt.Errorf("computed segment index %d out of bounds %d", | |||
segIndex, len(i.snapshot.segment)) | |||
} | |||
// skip directly to the target segment | |||
i.segmentOffset = segIndex | |||
next, err := i.iterators[i.segmentOffset].Advance(ldocNum) | |||
if err != nil { | |||
return nil, err | |||
} | |||
if next == nil { | |||
return nil, nil | |||
// we jumped directly to the segment that should have contained it | |||
// but it wasn't there, so reuse Next() which should correctly | |||
// get the next hit after it (we moved i.segmentOffset) | |||
return i.Next(preAlloced) | |||
} | |||
for bytes.Compare(next.ID, ID) < 0 { | |||
next, err = i.Next(preAlloced) | |||
if err != nil { | |||
return nil, err | |||
} | |||
if next == nil { | |||
break | |||
} | |||
if preAlloced == nil { | |||
preAlloced = &index.TermFieldDoc{} | |||
} | |||
return next, nil | |||
preAlloced.ID = docNumberToBytes(preAlloced.ID, next.Number()+ | |||
i.snapshot.offsets[segIndex]) | |||
i.postingToTermFieldDoc(next, preAlloced) | |||
i.currID = preAlloced.ID | |||
i.currPosting = next | |||
return preAlloced, nil | |||
} | |||
func (i *IndexSnapshotTermFieldReader) Count() uint64 { | |||
@@ -126,7 +178,8 @@ func (i *IndexSnapshotTermFieldReader) Count() uint64 { | |||
func (i *IndexSnapshotTermFieldReader) Close() error { | |||
if i.snapshot != nil { | |||
atomic.AddUint64(&i.snapshot.parent.stats.termSearchersFinished, uint64(1)) | |||
atomic.AddUint64(&i.snapshot.parent.stats.TotTermSearchersFinished, uint64(1)) | |||
i.snapshot.recycleTermFieldReader(i) | |||
} | |||
return nil | |||
} |
@@ -19,7 +19,7 @@ import ( | |||
"log" | |||
"github.com/blevesearch/bleve/index/scorch/segment" | |||
"github.com/boltdb/bolt" | |||
bolt "github.com/etcd-io/bbolt" | |||
) | |||
type RollbackPoint struct { |
@@ -15,42 +15,25 @@ | |||
package scorch | |||
import ( | |||
"bytes" | |||
"sync" | |||
"sync/atomic" | |||
"github.com/RoaringBitmap/roaring" | |||
"github.com/blevesearch/bleve/index" | |||
"github.com/blevesearch/bleve/index/scorch/segment" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var TermSeparator byte = 0xff | |||
var TermSeparatorSplitSlice = []byte{TermSeparator} | |||
type SegmentDictionarySnapshot struct { | |||
s *SegmentSnapshot | |||
d segment.TermDictionary | |||
} | |||
func (s *SegmentDictionarySnapshot) PostingsList(term string, except *roaring.Bitmap) (segment.PostingsList, error) { | |||
// TODO: if except is non-nil, perhaps need to OR it with s.s.deleted? | |||
return s.d.PostingsList(term, s.s.deleted) | |||
} | |||
func (s *SegmentDictionarySnapshot) Iterator() segment.DictionaryIterator { | |||
return s.d.Iterator() | |||
} | |||
func (s *SegmentDictionarySnapshot) PrefixIterator(prefix string) segment.DictionaryIterator { | |||
return s.d.PrefixIterator(prefix) | |||
} | |||
func (s *SegmentDictionarySnapshot) RangeIterator(start, end string) segment.DictionaryIterator { | |||
return s.d.RangeIterator(start, end) | |||
} | |||
type SegmentSnapshot struct { | |||
id uint64 | |||
segment segment.Segment | |||
deleted *roaring.Bitmap | |||
creator string | |||
cachedDocs *cachedDocs | |||
} | |||
@@ -83,8 +66,11 @@ func (s *SegmentSnapshot) VisitDocument(num uint64, visitor segment.DocumentFiel | |||
return s.segment.VisitDocument(num, visitor) | |||
} | |||
func (s *SegmentSnapshot) Count() uint64 { | |||
func (s *SegmentSnapshot) DocID(num uint64) ([]byte, error) { | |||
return s.segment.DocID(num) | |||
} | |||
func (s *SegmentSnapshot) Count() uint64 { | |||
rv := s.segment.Count() | |||
if s.deleted != nil { | |||
rv -= s.deleted.GetCardinality() | |||
@@ -92,17 +78,6 @@ func (s *SegmentSnapshot) Count() uint64 { | |||
return rv | |||
} | |||
func (s *SegmentSnapshot) Dictionary(field string) (segment.TermDictionary, error) { | |||
d, err := s.segment.Dictionary(field) | |||
if err != nil { | |||
return nil, err | |||
} | |||
return &SegmentDictionarySnapshot{ | |||
s: s, | |||
d: d, | |||
}, nil | |||
} | |||
func (s *SegmentSnapshot) DocNumbers(docIDs []string) (*roaring.Bitmap, error) { | |||
rv, err := s.segment.DocNumbers(docIDs) | |||
if err != nil { | |||
@@ -114,7 +89,7 @@ func (s *SegmentSnapshot) DocNumbers(docIDs []string) (*roaring.Bitmap, error) { | |||
return rv, nil | |||
} | |||
// DocNumbersLive returns bitsit containing doc numbers for all live docs | |||
// DocNumbersLive returns a bitmap containing doc numbers for all live docs | |||
func (s *SegmentSnapshot) DocNumbersLive() *roaring.Bitmap { | |||
rv := roaring.NewBitmap() | |||
rv.AddRange(0, s.segment.Count()) | |||
@@ -128,36 +103,68 @@ func (s *SegmentSnapshot) Fields() []string { | |||
return s.segment.Fields() | |||
} | |||
func (s *SegmentSnapshot) Size() (rv int) { | |||
rv = s.segment.Size() | |||
if s.deleted != nil { | |||
rv += int(s.deleted.GetSizeInBytes()) | |||
} | |||
rv += s.cachedDocs.Size() | |||
return | |||
} | |||
type cachedFieldDocs struct { | |||
m sync.Mutex | |||
readyCh chan struct{} // closed when the cachedFieldDocs.docs is ready to be used. | |||
err error // Non-nil if there was an error when preparing this cachedFieldDocs. | |||
docs map[uint64][]byte // Keyed by localDocNum, value is a list of terms delimited by 0xFF. | |||
size uint64 | |||
} | |||
func (cfd *cachedFieldDocs) prepareFields(field string, ss *SegmentSnapshot) { | |||
defer close(cfd.readyCh) | |||
func (cfd *cachedFieldDocs) Size() int { | |||
var rv int | |||
cfd.m.Lock() | |||
for _, entry := range cfd.docs { | |||
rv += 8 /* size of uint64 */ + len(entry) | |||
} | |||
cfd.m.Unlock() | |||
return rv | |||
} | |||
func (cfd *cachedFieldDocs) prepareField(field string, ss *SegmentSnapshot) { | |||
cfd.m.Lock() | |||
defer func() { | |||
close(cfd.readyCh) | |||
cfd.m.Unlock() | |||
}() | |||
cfd.size += uint64(size.SizeOfUint64) /* size field */ | |||
dict, err := ss.segment.Dictionary(field) | |||
if err != nil { | |||
cfd.err = err | |||
return | |||
} | |||
var postings segment.PostingsList | |||
var postingsItr segment.PostingsIterator | |||
dictItr := dict.Iterator() | |||
next, err := dictItr.Next() | |||
for err == nil && next != nil { | |||
postings, err1 := dict.PostingsList(next.Term, nil) | |||
var err1 error | |||
postings, err1 = dict.PostingsList([]byte(next.Term), nil, postings) | |||
if err1 != nil { | |||
cfd.err = err1 | |||
return | |||
} | |||
postingsItr := postings.Iterator() | |||
cfd.size += uint64(size.SizeOfUint64) /* map key */ | |||
postingsItr = postings.Iterator(false, false, false, postingsItr) | |||
nextPosting, err2 := postingsItr.Next() | |||
for err2 == nil && nextPosting != nil { | |||
docNum := nextPosting.Number() | |||
cfd.docs[docNum] = append(cfd.docs[docNum], []byte(next.Term)...) | |||
cfd.docs[docNum] = append(cfd.docs[docNum], TermSeparator) | |||
cfd.size += uint64(len(next.Term) + 1) // map value | |||
nextPosting, err2 = postingsItr.Next() | |||
} | |||
@@ -178,10 +185,12 @@ func (cfd *cachedFieldDocs) prepareFields(field string, ss *SegmentSnapshot) { | |||
type cachedDocs struct { | |||
m sync.Mutex // As the cache is asynchronously prepared, need a lock | |||
cache map[string]*cachedFieldDocs // Keyed by field | |||
size uint64 | |||
} | |||
func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) error { | |||
c.m.Lock() | |||
if c.cache == nil { | |||
c.cache = make(map[string]*cachedFieldDocs, len(ss.Fields())) | |||
} | |||
@@ -194,7 +203,7 @@ func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) e | |||
docs: make(map[uint64][]byte), | |||
} | |||
go c.cache[field].prepareFields(field, ss) | |||
go c.cache[field].prepareField(field, ss) | |||
} | |||
} | |||
@@ -209,21 +218,62 @@ func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) e | |||
c.m.Lock() | |||
} | |||
c.updateSizeLOCKED() | |||
c.m.Unlock() | |||
return nil | |||
} | |||
func (c *cachedDocs) sizeInBytes() uint64 { | |||
sizeInBytes := 0 | |||
// hasFields returns true if the cache has all the given fields | |||
func (c *cachedDocs) hasFields(fields []string) bool { | |||
c.m.Lock() | |||
for _, field := range fields { | |||
if _, exists := c.cache[field]; !exists { | |||
c.m.Unlock() | |||
return false // found a field not in cache | |||
} | |||
} | |||
c.m.Unlock() | |||
return true | |||
} | |||
func (c *cachedDocs) Size() int { | |||
return int(atomic.LoadUint64(&c.size)) | |||
} | |||
func (c *cachedDocs) updateSizeLOCKED() { | |||
sizeInBytes := 0 | |||
for k, v := range c.cache { // cachedFieldDocs | |||
sizeInBytes += len(k) | |||
if v != nil { | |||
for _, entry := range v.docs { // docs | |||
sizeInBytes += 8 /* size of uint64 */ + len(entry) | |||
sizeInBytes += v.Size() | |||
} | |||
} | |||
atomic.StoreUint64(&c.size, uint64(sizeInBytes)) | |||
} | |||
func (c *cachedDocs) visitDoc(localDocNum uint64, | |||
fields []string, visitor index.DocumentFieldTermVisitor) { | |||
c.m.Lock() | |||
for _, field := range fields { | |||
if cachedFieldDocs, exists := c.cache[field]; exists { | |||
c.m.Unlock() | |||
<-cachedFieldDocs.readyCh | |||
c.m.Lock() | |||
if tlist, exists := cachedFieldDocs.docs[localDocNum]; exists { | |||
for { | |||
i := bytes.Index(tlist, TermSeparatorSplitSlice) | |||
if i < 0 { | |||
break | |||
} | |||
visitor(field, tlist[0:i]) | |||
tlist = tlist[i+1:] | |||
} | |||
} | |||
} | |||
} | |||
c.m.Unlock() | |||
return uint64(sizeInBytes) | |||
} |
@@ -16,63 +16,125 @@ package scorch | |||
import ( | |||
"encoding/json" | |||
"io/ioutil" | |||
"reflect" | |||
"sync/atomic" | |||
) | |||
// Stats tracks statistics about the index | |||
// Stats tracks statistics about the index, fields that are | |||
// prefixed like CurXxxx are gauges (can go up and down), | |||
// and fields that are prefixed like TotXxxx are monotonically | |||
// increasing counters. | |||
type Stats struct { | |||
updates, deletes, batches, errors uint64 | |||
analysisTime, indexTime uint64 | |||
termSearchersStarted uint64 | |||
termSearchersFinished uint64 | |||
numPlainTextBytesIndexed uint64 | |||
numItemsIntroduced uint64 | |||
numItemsPersisted uint64 | |||
i *Scorch | |||
} | |||
TotUpdates uint64 | |||
TotDeletes uint64 | |||
func (s *Stats) statsMap() (map[string]interface{}, error) { | |||
m := map[string]interface{}{} | |||
m["updates"] = atomic.LoadUint64(&s.updates) | |||
m["deletes"] = atomic.LoadUint64(&s.deletes) | |||
m["batches"] = atomic.LoadUint64(&s.batches) | |||
m["errors"] = atomic.LoadUint64(&s.errors) | |||
m["analysis_time"] = atomic.LoadUint64(&s.analysisTime) | |||
m["index_time"] = atomic.LoadUint64(&s.indexTime) | |||
m["term_searchers_started"] = atomic.LoadUint64(&s.termSearchersStarted) | |||
m["term_searchers_finished"] = atomic.LoadUint64(&s.termSearchersFinished) | |||
m["num_plain_text_bytes_indexed"] = atomic.LoadUint64(&s.numPlainTextBytesIndexed) | |||
m["num_items_introduced"] = atomic.LoadUint64(&s.numItemsIntroduced) | |||
m["num_items_persisted"] = atomic.LoadUint64(&s.numItemsPersisted) | |||
if s.i.path != "" { | |||
finfos, err := ioutil.ReadDir(s.i.path) | |||
if err != nil { | |||
return nil, err | |||
} | |||
TotBatches uint64 | |||
TotBatchesEmpty uint64 | |||
TotBatchIntroTime uint64 | |||
MaxBatchIntroTime uint64 | |||
var numFilesOnDisk, numBytesUsedDisk uint64 | |||
CurRootEpoch uint64 | |||
LastPersistedEpoch uint64 | |||
LastMergedEpoch uint64 | |||
for _, finfo := range finfos { | |||
if !finfo.IsDir() { | |||
numBytesUsedDisk += uint64(finfo.Size()) | |||
numFilesOnDisk++ | |||
} | |||
} | |||
TotOnErrors uint64 | |||
m["num_bytes_used_disk"] = numBytesUsedDisk | |||
m["num_files_on_disk"] = numFilesOnDisk | |||
} | |||
TotAnalysisTime uint64 | |||
TotIndexTime uint64 | |||
TotIndexedPlainTextBytes uint64 | |||
TotTermSearchersStarted uint64 | |||
TotTermSearchersFinished uint64 | |||
TotIntroduceLoop uint64 | |||
TotIntroduceSegmentBeg uint64 | |||
TotIntroduceSegmentEnd uint64 | |||
TotIntroducePersistBeg uint64 | |||
TotIntroducePersistEnd uint64 | |||
TotIntroduceMergeBeg uint64 | |||
TotIntroduceMergeEnd uint64 | |||
TotIntroduceRevertBeg uint64 | |||
TotIntroduceRevertEnd uint64 | |||
TotIntroducedItems uint64 | |||
TotIntroducedSegmentsBatch uint64 | |||
TotIntroducedSegmentsMerge uint64 | |||
TotPersistLoopBeg uint64 | |||
TotPersistLoopErr uint64 | |||
TotPersistLoopProgress uint64 | |||
TotPersistLoopWait uint64 | |||
TotPersistLoopWaitNotified uint64 | |||
TotPersistLoopEnd uint64 | |||
TotPersistedItems uint64 | |||
TotItemsToPersist uint64 | |||
TotPersistedSegments uint64 | |||
TotPersisterSlowMergerPause uint64 | |||
TotPersisterSlowMergerResume uint64 | |||
TotPersisterNapPauseCompleted uint64 | |||
TotPersisterMergerNapBreak uint64 | |||
return m, nil | |||
TotFileMergeLoopBeg uint64 | |||
TotFileMergeLoopErr uint64 | |||
TotFileMergeLoopEnd uint64 | |||
TotFileMergePlan uint64 | |||
TotFileMergePlanErr uint64 | |||
TotFileMergePlanNone uint64 | |||
TotFileMergePlanOk uint64 | |||
TotFileMergePlanTasks uint64 | |||
TotFileMergePlanTasksDone uint64 | |||
TotFileMergePlanTasksErr uint64 | |||
TotFileMergePlanTasksSegments uint64 | |||
TotFileMergePlanTasksSegmentsEmpty uint64 | |||
TotFileMergeSegmentsEmpty uint64 | |||
TotFileMergeSegments uint64 | |||
TotFileSegmentsAtRoot uint64 | |||
TotFileMergeWrittenBytes uint64 | |||
TotFileMergeZapBeg uint64 | |||
TotFileMergeZapEnd uint64 | |||
TotFileMergeZapTime uint64 | |||
MaxFileMergeZapTime uint64 | |||
TotFileMergeIntroductions uint64 | |||
TotFileMergeIntroductionsDone uint64 | |||
TotFileMergeIntroductionsSkipped uint64 | |||
TotMemMergeBeg uint64 | |||
TotMemMergeErr uint64 | |||
TotMemMergeDone uint64 | |||
TotMemMergeZapBeg uint64 | |||
TotMemMergeZapEnd uint64 | |||
TotMemMergeZapTime uint64 | |||
MaxMemMergeZapTime uint64 | |||
TotMemMergeSegments uint64 | |||
TotMemorySegmentsAtRoot uint64 | |||
} | |||
// MarshalJSON implements json.Marshaler | |||
func (s *Stats) MarshalJSON() ([]byte, error) { | |||
m, err := s.statsMap() | |||
if err != nil { | |||
return nil, err | |||
// atomically populates the returned map | |||
func (s *Stats) ToMap() map[string]interface{} { | |||
m := map[string]interface{}{} | |||
sve := reflect.ValueOf(s).Elem() | |||
svet := sve.Type() | |||
for i := 0; i < svet.NumField(); i++ { | |||
svef := sve.Field(i) | |||
if svef.CanAddr() { | |||
svefp := svef.Addr().Interface() | |||
m[svet.Field(i).Name] = atomic.LoadUint64(svefp.(*uint64)) | |||
} | |||
} | |||
return json.Marshal(m) | |||
return m | |||
} | |||
// MarshalJSON implements json.Marshaler, and in contrast to standard | |||
// json marshaling provides atomic safety | |||
func (s *Stats) MarshalJSON() ([]byte, error) { | |||
return json.Marshal(s.ToMap()) | |||
} |
@@ -17,7 +17,7 @@ package boltdb | |||
import ( | |||
"bytes" | |||
"github.com/boltdb/bolt" | |||
bolt "github.com/etcd-io/bbolt" | |||
) | |||
type Iterator struct { |
@@ -16,7 +16,7 @@ package boltdb | |||
import ( | |||
"github.com/blevesearch/bleve/index/store" | |||
"github.com/boltdb/bolt" | |||
bolt "github.com/etcd-io/bbolt" | |||
) | |||
type Reader struct { |
@@ -30,7 +30,7 @@ import ( | |||
"github.com/blevesearch/bleve/index/store" | |||
"github.com/blevesearch/bleve/registry" | |||
"github.com/boltdb/bolt" | |||
bolt "github.com/etcd-io/bbolt" | |||
) | |||
const ( | |||
@@ -74,6 +74,12 @@ func New(mo store.MergeOperator, config map[string]interface{}) (store.KVStore, | |||
bo.ReadOnly = ro | |||
} | |||
if initialMmapSize, ok := config["initialMmapSize"].(int); ok { | |||
bo.InitialMmapSize = initialMmapSize | |||
} else if initialMmapSize, ok := config["initialMmapSize"].(float64); ok { | |||
bo.InitialMmapSize = int(initialMmapSize) | |||
} | |||
db, err := bolt.Open(path, 0600, bo) | |||
if err != nil { | |||
return nil, err |
@@ -15,11 +15,20 @@ | |||
package upsidedown | |||
import ( | |||
"reflect" | |||
"github.com/blevesearch/bleve/document" | |||
"github.com/blevesearch/bleve/index" | |||
"github.com/blevesearch/bleve/index/store" | |||
) | |||
var reflectStaticSizeIndexReader int | |||
func init() { | |||
var ir IndexReader | |||
reflectStaticSizeIndexReader = int(reflect.TypeOf(ir).Size()) | |||
} | |||
type IndexReader struct { | |||
index *UpsideDownCouch | |||
kvreader store.KVReader | |||
@@ -201,3 +210,17 @@ func incrementBytes(in []byte) []byte { | |||
} | |||
return rv | |||
} | |||
func (i *IndexReader) DocValueReader(fields []string) (index.DocValueReader, error) { | |||
return &DocValueReader{i: i, fields: fields}, nil | |||
} | |||
type DocValueReader struct { | |||
i *IndexReader | |||
fields []string | |||
} | |||
func (dvr *DocValueReader) VisitDocValues(id index.IndexInternalID, | |||
visitor index.DocumentFieldTermVisitor) error { | |||
return dvr.i.DocumentVisitFieldTerms(id, dvr.fields, visitor) | |||
} |
@@ -16,13 +16,27 @@ package upsidedown | |||
import ( | |||
"bytes" | |||
"reflect" | |||
"sort" | |||
"sync/atomic" | |||
"github.com/blevesearch/bleve/index" | |||
"github.com/blevesearch/bleve/index/store" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var reflectStaticSizeUpsideDownCouchTermFieldReader int | |||
var reflectStaticSizeUpsideDownCouchDocIDReader int | |||
func init() { | |||
var tfr UpsideDownCouchTermFieldReader | |||
reflectStaticSizeUpsideDownCouchTermFieldReader = | |||
int(reflect.TypeOf(tfr).Size()) | |||
var cdr UpsideDownCouchDocIDReader | |||
reflectStaticSizeUpsideDownCouchDocIDReader = | |||
int(reflect.TypeOf(cdr).Size()) | |||
} | |||
type UpsideDownCouchTermFieldReader struct { | |||
count uint64 | |||
indexReader *IndexReader | |||
@@ -35,6 +49,19 @@ type UpsideDownCouchTermFieldReader struct { | |||
includeTermVectors bool | |||
} | |||
func (r *UpsideDownCouchTermFieldReader) Size() int { | |||
sizeInBytes := reflectStaticSizeUpsideDownCouchTermFieldReader + size.SizeOfPtr + | |||
len(r.term) + | |||
r.tfrPrealloc.Size() + | |||
len(r.keyBuf) | |||
if r.tfrNext != nil { | |||
sizeInBytes += r.tfrNext.Size() | |||
} | |||
return sizeInBytes | |||
} | |||
func newUpsideDownCouchTermFieldReader(indexReader *IndexReader, term []byte, field uint16, includeFreq, includeNorm, includeTermVectors bool) (*UpsideDownCouchTermFieldReader, error) { | |||
bufNeeded := termFrequencyRowKeySize(term, nil) | |||
if bufNeeded < dictionaryRowKeySize(term) { | |||
@@ -174,8 +201,18 @@ type UpsideDownCouchDocIDReader struct { | |||
onlyMode bool | |||
} | |||
func newUpsideDownCouchDocIDReader(indexReader *IndexReader) (*UpsideDownCouchDocIDReader, error) { | |||
func (r *UpsideDownCouchDocIDReader) Size() int { | |||
sizeInBytes := reflectStaticSizeUpsideDownCouchDocIDReader + | |||
reflectStaticSizeIndexReader + size.SizeOfPtr | |||
for _, entry := range r.only { | |||
sizeInBytes += size.SizeOfString + len(entry) | |||
} | |||
return sizeInBytes | |||
} | |||
func newUpsideDownCouchDocIDReader(indexReader *IndexReader) (*UpsideDownCouchDocIDReader, error) { | |||
startBytes := []byte{0x0} | |||
endBytes := []byte{0xff} | |||
@@ -20,10 +20,22 @@ import ( | |||
"fmt" | |||
"io" | |||
"math" | |||
"reflect" | |||
"github.com/blevesearch/bleve/size" | |||
"github.com/golang/protobuf/proto" | |||
) | |||
var reflectStaticSizeTermFrequencyRow int | |||
var reflectStaticSizeTermVector int | |||
func init() { | |||
var tfr TermFrequencyRow | |||
reflectStaticSizeTermFrequencyRow = int(reflect.TypeOf(tfr).Size()) | |||
var tv TermVector | |||
reflectStaticSizeTermVector = int(reflect.TypeOf(tv).Size()) | |||
} | |||
const ByteSeparator byte = 0xff | |||
type UpsideDownCouchRowStream chan UpsideDownCouchRow | |||
@@ -358,6 +370,11 @@ type TermVector struct { | |||
end uint64 | |||
} | |||
func (tv *TermVector) Size() int { | |||
return reflectStaticSizeTermVector + size.SizeOfPtr + | |||
len(tv.arrayPositions)*size.SizeOfUint64 | |||
} | |||
func (tv *TermVector) String() string { | |||
return fmt.Sprintf("Field: %d Pos: %d Start: %d End %d ArrayPositions: %#v", tv.field, tv.pos, tv.start, tv.end, tv.arrayPositions) | |||
} | |||
@@ -371,6 +388,18 @@ type TermFrequencyRow struct { | |||
field uint16 | |||
} | |||
func (tfr *TermFrequencyRow) Size() int { | |||
sizeInBytes := reflectStaticSizeTermFrequencyRow + | |||
len(tfr.term) + | |||
len(tfr.doc) | |||
for _, entry := range tfr.vectors { | |||
sizeInBytes += entry.Size() | |||
} | |||
return sizeInBytes | |||
} | |||
func (tfr *TermFrequencyRow) Term() []byte { | |||
return tfr.term | |||
} | |||
@@ -555,7 +584,7 @@ func (tfr *TermFrequencyRow) parseK(key []byte) error { | |||
func (tfr *TermFrequencyRow) parseKDoc(key []byte, term []byte) error { | |||
tfr.doc = key[3+len(term)+1:] | |||
if len(tfr.doc) <= 0 { | |||
if len(tfr.doc) == 0 { | |||
return fmt.Errorf("invalid term frequency key, empty docid") | |||
} | |||
@@ -775,7 +775,7 @@ func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis. | |||
} | |||
func (udc *UpsideDownCouch) termFieldVectorsFromTermVectors(in []*TermVector) []*index.TermFieldVector { | |||
if len(in) <= 0 { | |||
if len(in) == 0 { | |||
return nil | |||
} | |||
@@ -810,15 +810,17 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) { | |||
} | |||
} | |||
go func() { | |||
for _, doc := range batch.IndexOps { | |||
if doc != nil { | |||
aw := index.NewAnalysisWork(udc, doc, resultChan) | |||
// put the work on the queue | |||
udc.analysisQueue.Queue(aw) | |||
if len(batch.IndexOps) > 0 { | |||
go func() { | |||
for _, doc := range batch.IndexOps { | |||
if doc != nil { | |||
aw := index.NewAnalysisWork(udc, doc, resultChan) | |||
// put the work on the queue | |||
udc.analysisQueue.Queue(aw) | |||
} | |||
} | |||
} | |||
}() | |||
}() | |||
} | |||
// retrieve back index rows concurrent with analysis | |||
docBackIndexRowErr := error(nil) | |||
@@ -958,6 +960,11 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) { | |||
} else { | |||
atomic.AddUint64(&udc.stats.errors, 1) | |||
} | |||
persistedCallback := batch.PersistedCallback() | |||
if persistedCallback != nil { | |||
persistedCallback(err) | |||
} | |||
return | |||
} | |||
@@ -433,6 +433,7 @@ func createChildSearchRequest(req *SearchRequest) *SearchRequest { | |||
Explain: req.Explain, | |||
Sort: req.Sort.Copy(), | |||
IncludeLocations: req.IncludeLocations, | |||
Score: req.Score, | |||
} | |||
return &rv | |||
} |
@@ -50,6 +50,12 @@ const storePath = "store" | |||
var mappingInternalKey = []byte("_mapping") | |||
const SearchQueryStartCallbackKey = "_search_query_start_callback_key" | |||
const SearchQueryEndCallbackKey = "_search_query_end_callback_key" | |||
type SearchQueryStartCallbackFn func(size uint64) error | |||
type SearchQueryEndCallbackFn func(size uint64) error | |||
func indexStorePath(path string) string { | |||
return path + string(os.PathSeparator) + storePath | |||
} | |||
@@ -362,8 +368,70 @@ func (i *indexImpl) Search(req *SearchRequest) (sr *SearchResult, err error) { | |||
return i.SearchInContext(context.Background(), req) | |||
} | |||
var documentMatchEmptySize int | |||
var searchContextEmptySize int | |||
var facetResultEmptySize int | |||
var documentEmptySize int | |||
func init() { | |||
var dm search.DocumentMatch | |||
documentMatchEmptySize = dm.Size() | |||
var sc search.SearchContext | |||
searchContextEmptySize = sc.Size() | |||
var fr search.FacetResult | |||
facetResultEmptySize = fr.Size() | |||
var d document.Document | |||
documentEmptySize = d.Size() | |||
} | |||
// memNeededForSearch is a helper function that returns an estimate of RAM | |||
// needed to execute a search request. | |||
func memNeededForSearch(req *SearchRequest, | |||
searcher search.Searcher, | |||
topnCollector *collector.TopNCollector) uint64 { | |||
backingSize := req.Size + req.From + 1 | |||
if req.Size+req.From > collector.PreAllocSizeSkipCap { | |||
backingSize = collector.PreAllocSizeSkipCap + 1 | |||
} | |||
numDocMatches := backingSize + searcher.DocumentMatchPoolSize() | |||
estimate := 0 | |||
// overhead, size in bytes from collector | |||
estimate += topnCollector.Size() | |||
// pre-allocing DocumentMatchPool | |||
estimate += searchContextEmptySize + numDocMatches*documentMatchEmptySize | |||
// searcher overhead | |||
estimate += searcher.Size() | |||
// overhead from results, lowestMatchOutsideResults | |||
estimate += (numDocMatches + 1) * documentMatchEmptySize | |||
// additional overhead from SearchResult | |||
estimate += reflectStaticSizeSearchResult + reflectStaticSizeSearchStatus | |||
// overhead from facet results | |||
if req.Facets != nil { | |||
estimate += len(req.Facets) * facetResultEmptySize | |||
} | |||
// highlighting, store | |||
if len(req.Fields) > 0 || req.Highlight != nil { | |||
// Size + From => number of hits | |||
estimate += (req.Size + req.From) * documentEmptySize | |||
} | |||
return uint64(estimate) | |||
} | |||
// SearchInContext executes a search request operation within the provided | |||
// Context. Returns a SearchResult object or an error. | |||
// Context. Returns a SearchResult object or an error. | |||
func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr *SearchResult, err error) { | |||
i.mutex.RLock() | |||
defer i.mutex.RUnlock() | |||
@@ -390,6 +458,7 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr | |||
searcher, err := req.Query.Searcher(indexReader, i.m, search.SearcherOptions{ | |||
Explain: req.Explain, | |||
IncludeTermVectors: req.IncludeLocations || req.Highlight != nil, | |||
Score: req.Score, | |||
}) | |||
if err != nil { | |||
return nil, err | |||
@@ -428,6 +497,24 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr | |||
collector.SetFacetsBuilder(facetsBuilder) | |||
} | |||
memNeeded := memNeededForSearch(req, searcher, collector) | |||
if cb := ctx.Value(SearchQueryStartCallbackKey); cb != nil { | |||
if cbF, ok := cb.(SearchQueryStartCallbackFn); ok { | |||
err = cbF(memNeeded) | |||
} | |||
} | |||
if err != nil { | |||
return nil, err | |||
} | |||
if cb := ctx.Value(SearchQueryEndCallbackKey); cb != nil { | |||
if cbF, ok := cb.(SearchQueryEndCallbackFn); ok { | |||
defer func() { | |||
_ = cbF(memNeeded) | |||
}() | |||
} | |||
} | |||
err = collector.Collect(ctx, searcher, indexReader) | |||
if err != nil { | |||
return nil, err | |||
@@ -459,7 +546,8 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr | |||
doc, err := indexReader.Document(hit.ID) | |||
if err == nil && doc != nil { | |||
if len(req.Fields) > 0 { | |||
for _, f := range req.Fields { | |||
fieldsToLoad := deDuplicate(req.Fields) | |||
for _, f := range fieldsToLoad { | |||
for _, docF := range doc.Fields { | |||
if f == "*" || docF.Name() == f { | |||
var value interface{} | |||
@@ -533,9 +621,7 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr | |||
return &SearchResult{ | |||
Status: &SearchStatus{ | |||
Total: 1, | |||
Failed: 0, | |||
Successful: 1, | |||
Errors: make(map[string]error), | |||
}, | |||
Request: req, | |||
Hits: hits, | |||
@@ -755,3 +841,16 @@ func (f *indexImplFieldDict) Close() error { | |||
} | |||
return f.indexReader.Close() | |||
} | |||
// helper function to remove duplicate entries from slice of strings | |||
func deDuplicate(fields []string) []string { | |||
entries := make(map[string]struct{}) | |||
ret := []string{} | |||
for _, entry := range fields { | |||
if _, exists := entries[entry]; !exists { | |||
entries[entry] = struct{}{} | |||
ret = append(ret, entry) | |||
} | |||
} | |||
return ret | |||
} |
@@ -18,6 +18,7 @@ import ( | |||
"encoding/json" | |||
"io/ioutil" | |||
"os" | |||
"path/filepath" | |||
"github.com/blevesearch/bleve/index/upsidedown" | |||
) | |||
@@ -92,5 +93,5 @@ func (i *indexMeta) Save(path string) (err error) { | |||
} | |||
func indexMetaPath(path string) string { | |||
return path + string(os.PathSeparator) + metaFilename | |||
return filepath.Join(path, metaFilename) | |||
} |
@@ -42,7 +42,7 @@ type DocumentMapping struct { | |||
Dynamic bool `json:"dynamic"` | |||
Properties map[string]*DocumentMapping `json:"properties,omitempty"` | |||
Fields []*FieldMapping `json:"fields,omitempty"` | |||
DefaultAnalyzer string `json:"default_analyzer"` | |||
DefaultAnalyzer string `json:"default_analyzer,omitempty"` | |||
// StructTagKey overrides "json" when looking for field names in struct tags | |||
StructTagKey string `json:"struct_tag_key,omitempty"` | |||
@@ -324,13 +324,17 @@ func (dm *DocumentMapping) defaultAnalyzerName(path []string) string { | |||
} | |||
func (dm *DocumentMapping) walkDocument(data interface{}, path []string, indexes []uint64, context *walkContext) { | |||
// allow default "json" tag to be overriden | |||
// allow default "json" tag to be overridden | |||
structTagKey := dm.StructTagKey | |||
if structTagKey == "" { | |||
structTagKey = "json" | |||
} | |||
val := reflect.ValueOf(data) | |||
if !val.IsValid() { | |||
return | |||
} | |||
typ := val.Type() | |||
switch typ.Kind() { | |||
case reflect.Map: | |||
@@ -420,7 +424,11 @@ func (dm *DocumentMapping) processProperty(property interface{}, path []string, | |||
if subDocMapping != nil { | |||
// index by explicit mapping | |||
for _, fieldMapping := range subDocMapping.Fields { | |||
fieldMapping.processString(propertyValueString, pathString, path, indexes, context) | |||
if fieldMapping.Type == "geopoint" { | |||
fieldMapping.processGeoPoint(property, pathString, path, indexes, context) | |||
} else { | |||
fieldMapping.processString(propertyValueString, pathString, path, indexes, context) | |||
} | |||
} | |||
} else if closestDocMapping.Dynamic { | |||
// automatic indexing behavior |
@@ -320,8 +320,8 @@ func (im *IndexMappingImpl) determineType(data interface{}) string { | |||
func (im *IndexMappingImpl) MapDocument(doc *document.Document, data interface{}) error { | |||
docType := im.determineType(data) | |||
docMapping := im.mappingForType(docType) | |||
walkContext := im.newWalkContext(doc, docMapping) | |||
if docMapping.Enabled { | |||
walkContext := im.newWalkContext(doc, docMapping) | |||
docMapping.walkDocument(data, []string{}, []uint64{}, walkContext) | |||
// see if the _all field was disabled |
@@ -35,6 +35,9 @@ func lookupPropertyPath(data interface{}, path string) interface{} { | |||
func lookupPropertyPathPart(data interface{}, part string) interface{} { | |||
val := reflect.ValueOf(data) | |||
if !val.IsValid() { | |||
return nil | |||
} | |||
typ := val.Type() | |||
switch typ.Kind() { | |||
case reflect.Map: |
@@ -14,7 +14,7 @@ var interleaveShift = []uint{1, 2, 4, 8, 16} | |||
// Interleave the first 32 bits of each uint64 | |||
// apdated from org.apache.lucene.util.BitUtil | |||
// whcih was adapted from: | |||
// which was adapted from: | |||
// http://graphics.stanford.edu/~seander/bithacks.html#InterleaveBMN | |||
func Interleave(v1, v2 uint64) uint64 { | |||
v1 = (v1 | (v1 << interleaveShift[4])) & interleaveMagic[4] |
@@ -77,6 +77,10 @@ func (p PrefixCoded) Int64() (int64, error) { | |||
} | |||
func ValidPrefixCodedTerm(p string) (bool, int) { | |||
return ValidPrefixCodedTermBytes([]byte(p)) | |||
} | |||
func ValidPrefixCodedTermBytes(p []byte) (bool, int) { | |||
if len(p) > 0 { | |||
if p[0] < ShiftStartInt64 || p[0] > ShiftStartInt64+63 { | |||
return false, 0 |
@@ -17,15 +17,29 @@ package bleve | |||
import ( | |||
"encoding/json" | |||
"fmt" | |||
"reflect" | |||
"time" | |||
"github.com/blevesearch/bleve/analysis" | |||
"github.com/blevesearch/bleve/analysis/datetime/optional" | |||
"github.com/blevesearch/bleve/document" | |||
"github.com/blevesearch/bleve/registry" | |||
"github.com/blevesearch/bleve/search" | |||
"github.com/blevesearch/bleve/search/collector" | |||
"github.com/blevesearch/bleve/search/query" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var reflectStaticSizeSearchResult int | |||
var reflectStaticSizeSearchStatus int | |||
func init() { | |||
var sr SearchResult | |||
reflectStaticSizeSearchResult = int(reflect.TypeOf(sr).Size()) | |||
var ss SearchStatus | |||
reflectStaticSizeSearchStatus = int(reflect.TypeOf(ss).Size()) | |||
} | |||
var cache = registry.NewCache() | |||
const defaultDateTimeParser = optional.Name | |||
@@ -247,6 +261,7 @@ func (h *HighlightRequest) AddField(field string) { | |||
// Explain triggers inclusion of additional search | |||
// result score explanations. | |||
// Sort describes the desired order for the results to be returned. | |||
// Score controls the kind of scoring performed | |||
// | |||
// A special field named "*" can be used to return all fields. | |||
type SearchRequest struct { | |||
@@ -259,6 +274,7 @@ type SearchRequest struct { | |||
Explain bool `json:"explain"` | |||
Sort search.SortOrder `json:"sort"` | |||
IncludeLocations bool `json:"includeLocations"` | |||
Score string `json:"score,omitempty"` | |||
} | |||
func (r *SearchRequest) Validate() error { | |||
@@ -308,6 +324,7 @@ func (r *SearchRequest) UnmarshalJSON(input []byte) error { | |||
Explain bool `json:"explain"` | |||
Sort []json.RawMessage `json:"sort"` | |||
IncludeLocations bool `json:"includeLocations"` | |||
Score string `json:"score"` | |||
} | |||
err := json.Unmarshal(input, &temp) | |||
@@ -334,6 +351,7 @@ func (r *SearchRequest) UnmarshalJSON(input []byte) error { | |||
r.Fields = temp.Fields | |||
r.Facets = temp.Facets | |||
r.IncludeLocations = temp.IncludeLocations | |||
r.Score = temp.Score | |||
r.Query, err = query.ParseQuery(temp.Q) | |||
if err != nil { | |||
return err | |||
@@ -432,6 +450,24 @@ type SearchResult struct { | |||
Facets search.FacetResults `json:"facets"` | |||
} | |||
func (sr *SearchResult) Size() int { | |||
sizeInBytes := reflectStaticSizeSearchResult + size.SizeOfPtr + | |||
reflectStaticSizeSearchStatus | |||
for _, entry := range sr.Hits { | |||
if entry != nil { | |||
sizeInBytes += entry.Size() | |||
} | |||
} | |||
for k, v := range sr.Facets { | |||
sizeInBytes += size.SizeOfString + len(k) + | |||
v.Size() | |||
} | |||
return sizeInBytes | |||
} | |||
func (sr *SearchResult) String() string { | |||
rv := "" | |||
if sr.Total > 0 { | |||
@@ -488,3 +524,44 @@ func (sr *SearchResult) Merge(other *SearchResult) { | |||
sr.Facets.Merge(other.Facets) | |||
} | |||
// MemoryNeededForSearchResult is an exported helper function to determine the RAM | |||
// needed to accommodate the results for a given search request. | |||
func MemoryNeededForSearchResult(req *SearchRequest) uint64 { | |||
if req == nil { | |||
return 0 | |||
} | |||
numDocMatches := req.Size + req.From | |||
if req.Size+req.From > collector.PreAllocSizeSkipCap { | |||
numDocMatches = collector.PreAllocSizeSkipCap | |||
} | |||
estimate := 0 | |||
// overhead from the SearchResult structure | |||
var sr SearchResult | |||
estimate += sr.Size() | |||
var dm search.DocumentMatch | |||
sizeOfDocumentMatch := dm.Size() | |||
// overhead from results | |||
estimate += numDocMatches * sizeOfDocumentMatch | |||
// overhead from facet results | |||
if req.Facets != nil { | |||
var fr search.FacetResult | |||
estimate += len(req.Facets) * fr.Size() | |||
} | |||
// highlighting, store | |||
var d document.Document | |||
if len(req.Fields) > 0 || req.Highlight != nil { | |||
for i := 0; i < (req.Size + req.From); i++ { | |||
estimate += (req.Size + req.From) * d.Size() | |||
} | |||
} | |||
return uint64(estimate) | |||
} |
@@ -30,3 +30,23 @@ type Collector interface { | |||
SetFacetsBuilder(facetsBuilder *FacetsBuilder) | |||
FacetResults() FacetResults | |||
} | |||
// DocumentMatchHandler is the type of document match callback | |||
// bleve will invoke during the search. | |||
// Eventually, bleve will indicate the completion of an ongoing search, | |||
// by passing a nil value for the document match callback. | |||
// The application should take a copy of the hit/documentMatch | |||
// if it wish to own it or need prolonged access to it. | |||
type DocumentMatchHandler func(hit *DocumentMatch) error | |||
type MakeDocumentMatchHandlerKeyType string | |||
var MakeDocumentMatchHandlerKey = MakeDocumentMatchHandlerKeyType( | |||
"MakeDocumentMatchHandlerKey") | |||
// MakeDocumentMatchHandler is an optional DocumentMatchHandler | |||
// builder function which the applications can pass to bleve. | |||
// These builder methods gives a DocumentMatchHandler function | |||
// to bleve, which it will invoke on every document matches. | |||
type MakeDocumentMatchHandler func(ctx *SearchContext) ( | |||
callback DocumentMatchHandler, loadID bool, err error) |
@@ -25,9 +25,9 @@ type collectStoreHeap struct { | |||
compare collectorCompare | |||
} | |||
func newStoreHeap(cap int, compare collectorCompare) *collectStoreHeap { | |||
func newStoreHeap(capacity int, compare collectorCompare) *collectStoreHeap { | |||
rv := &collectStoreHeap{ | |||
heap: make(search.DocumentMatchCollection, 0, cap), | |||
heap: make(search.DocumentMatchCollection, 0, capacity), | |||
compare: compare, | |||
} | |||
heap.Init(rv) |
@@ -25,7 +25,7 @@ type collectStoreList struct { | |||
compare collectorCompare | |||
} | |||
func newStoreList(cap int, compare collectorCompare) *collectStoreList { | |||
func newStoreList(capacity int, compare collectorCompare) *collectStoreList { | |||
rv := &collectStoreList{ | |||
results: list.New(), | |||
compare: compare, | |||
@@ -34,8 +34,7 @@ func newStoreList(cap int, compare collectorCompare) *collectStoreList { | |||
return rv | |||
} | |||
func (c *collectStoreList) AddNotExceedingSize(doc *search.DocumentMatch, | |||
size int) *search.DocumentMatch { | |||
func (c *collectStoreList) AddNotExceedingSize(doc *search.DocumentMatch, size int) *search.DocumentMatch { | |||
c.add(doc) | |||
if c.len() > size { | |||
return c.removeLast() |
@@ -21,9 +21,9 @@ type collectStoreSlice struct { | |||
compare collectorCompare | |||
} | |||
func newStoreSlice(cap int, compare collectorCompare) *collectStoreSlice { | |||
func newStoreSlice(capacity int, compare collectorCompare) *collectStoreSlice { | |||
rv := &collectStoreSlice{ | |||
slice: make(search.DocumentMatchCollection, 0, cap), | |||
slice: make(search.DocumentMatchCollection, 0, capacity), | |||
compare: compare, | |||
} | |||
return rv |
@@ -16,12 +16,21 @@ package collector | |||
import ( | |||
"context" | |||
"reflect" | |||
"time" | |||
"github.com/blevesearch/bleve/index" | |||
"github.com/blevesearch/bleve/search" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var reflectStaticSizeTopNCollector int | |||
func init() { | |||
var coll TopNCollector | |||
reflectStaticSizeTopNCollector = int(reflect.TypeOf(coll).Size()) | |||
} | |||
type collectorStore interface { | |||
// Add the document, and if the new store size exceeds the provided size | |||
// the last element is removed and returned. If the size has not been | |||
@@ -58,6 +67,8 @@ type TopNCollector struct { | |||
cachedDesc []bool | |||
lowestMatchOutsideResults *search.DocumentMatch | |||
updateFieldVisitor index.DocumentFieldTermVisitor | |||
dvReader index.DocValueReader | |||
} | |||
// CheckDoneEvery controls how frequently we check the context deadline | |||
@@ -98,6 +109,22 @@ func NewTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector | |||
return hc | |||
} | |||
func (hc *TopNCollector) Size() int { | |||
sizeInBytes := reflectStaticSizeTopNCollector + size.SizeOfPtr | |||
if hc.facetsBuilder != nil { | |||
sizeInBytes += hc.facetsBuilder.Size() | |||
} | |||
for _, entry := range hc.neededFields { | |||
sizeInBytes += len(entry) + size.SizeOfString | |||
} | |||
sizeInBytes += len(hc.cachedScoring) + len(hc.cachedDesc) | |||
return sizeInBytes | |||
} | |||
// Collect goes to the index to find the matching documents | |||
func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, reader index.IndexReader) error { | |||
startTime := time.Now() | |||
@@ -113,8 +140,34 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, | |||
} | |||
searchContext := &search.SearchContext{ | |||
DocumentMatchPool: search.NewDocumentMatchPool(backingSize+searcher.DocumentMatchPoolSize(), len(hc.sort)), | |||
Collector: hc, | |||
} | |||
hc.dvReader, err = reader.DocValueReader(hc.neededFields) | |||
if err != nil { | |||
return err | |||
} | |||
hc.updateFieldVisitor = func(field string, term []byte) { | |||
if hc.facetsBuilder != nil { | |||
hc.facetsBuilder.UpdateVisitor(field, term) | |||
} | |||
hc.sort.UpdateVisitor(field, term) | |||
} | |||
dmHandlerMaker := MakeTopNDocumentMatchHandler | |||
if cv := ctx.Value(search.MakeDocumentMatchHandlerKey); cv != nil { | |||
dmHandlerMaker = cv.(search.MakeDocumentMatchHandler) | |||
} | |||
// use the application given builder for making the custom document match | |||
// handler and perform callbacks/invocations on the newly made handler. | |||
dmHandler, loadID, err := dmHandlerMaker(searchContext) | |||
if err != nil { | |||
return err | |||
} | |||
hc.needDocIds = hc.needDocIds || loadID | |||
select { | |||
case <-ctx.Done(): | |||
return ctx.Err() | |||
@@ -130,13 +183,26 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, | |||
} | |||
} | |||
err = hc.collectSingle(searchContext, reader, next) | |||
err = hc.prepareDocumentMatch(searchContext, reader, next) | |||
if err != nil { | |||
break | |||
} | |||
err = dmHandler(next) | |||
if err != nil { | |||
break | |||
} | |||
next, err = searcher.Next(searchContext) | |||
} | |||
// help finalize/flush the results in case | |||
// of custom document match handlers. | |||
err = dmHandler(nil) | |||
if err != nil { | |||
return err | |||
} | |||
// compute search duration | |||
hc.took = time.Since(startTime) | |||
if err != nil { | |||
@@ -152,8 +218,8 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, | |||
var sortByScoreOpt = []string{"_score"} | |||
func (hc *TopNCollector) collectSingle(ctx *search.SearchContext, reader index.IndexReader, d *search.DocumentMatch) error { | |||
var err error | |||
func (hc *TopNCollector) prepareDocumentMatch(ctx *search.SearchContext, | |||
reader index.IndexReader, d *search.DocumentMatch) (err error) { | |||
// visit field terms for features that require it (sort, facets) | |||
if len(hc.neededFields) > 0 { | |||
@@ -187,33 +253,49 @@ func (hc *TopNCollector) collectSingle(ctx *search.SearchContext, reader index.I | |||
hc.sort.Value(d) | |||
} | |||
// optimization, we track lowest sorting hit already removed from heap | |||
// with this one comparison, we can avoid all heap operations if | |||
// this hit would have been added and then immediately removed | |||
if hc.lowestMatchOutsideResults != nil { | |||
cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, d, hc.lowestMatchOutsideResults) | |||
if cmp >= 0 { | |||
// this hit can't possibly be in the result set, so avoid heap ops | |||
ctx.DocumentMatchPool.Put(d) | |||
return nil | |||
} | |||
} | |||
return nil | |||
} | |||
removed := hc.store.AddNotExceedingSize(d, hc.size+hc.skip) | |||
if removed != nil { | |||
if hc.lowestMatchOutsideResults == nil { | |||
hc.lowestMatchOutsideResults = removed | |||
} else { | |||
cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, removed, hc.lowestMatchOutsideResults) | |||
if cmp < 0 { | |||
tmp := hc.lowestMatchOutsideResults | |||
hc.lowestMatchOutsideResults = removed | |||
ctx.DocumentMatchPool.Put(tmp) | |||
func MakeTopNDocumentMatchHandler( | |||
ctx *search.SearchContext) (search.DocumentMatchHandler, bool, error) { | |||
var hc *TopNCollector | |||
var ok bool | |||
if hc, ok = ctx.Collector.(*TopNCollector); ok { | |||
return func(d *search.DocumentMatch) error { | |||
if d == nil { | |||
return nil | |||
} | |||
// optimization, we track lowest sorting hit already removed from heap | |||
// with this one comparison, we can avoid all heap operations if | |||
// this hit would have been added and then immediately removed | |||
if hc.lowestMatchOutsideResults != nil { | |||
cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, d, | |||
hc.lowestMatchOutsideResults) | |||
if cmp >= 0 { | |||
// this hit can't possibly be in the result set, so avoid heap ops | |||
ctx.DocumentMatchPool.Put(d) | |||
return nil | |||
} | |||
} | |||
} | |||
} | |||
return nil | |||
removed := hc.store.AddNotExceedingSize(d, hc.size+hc.skip) | |||
if removed != nil { | |||
if hc.lowestMatchOutsideResults == nil { | |||
hc.lowestMatchOutsideResults = removed | |||
} else { | |||
cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, | |||
removed, hc.lowestMatchOutsideResults) | |||
if cmp < 0 { | |||
tmp := hc.lowestMatchOutsideResults | |||
hc.lowestMatchOutsideResults = removed | |||
ctx.DocumentMatchPool.Put(tmp) | |||
} | |||
} | |||
} | |||
return nil | |||
}, false, nil | |||
} | |||
return nil, false, nil | |||
} | |||
// visitFieldTerms is responsible for visiting the field terms of the | |||
@@ -223,13 +305,7 @@ func (hc *TopNCollector) visitFieldTerms(reader index.IndexReader, d *search.Doc | |||
hc.facetsBuilder.StartDoc() | |||
} | |||
err := reader.DocumentVisitFieldTerms(d.IndexInternalID, hc.neededFields, func(field string, term []byte) { | |||
if hc.facetsBuilder != nil { | |||
hc.facetsBuilder.UpdateVisitor(field, term) | |||
} | |||
hc.sort.UpdateVisitor(field, term) | |||
}) | |||
err := hc.dvReader.VisitDocValues(d.IndexInternalID, hc.updateFieldVisitor) | |||
if hc.facetsBuilder != nil { | |||
hc.facetsBuilder.EndDoc() | |||
} | |||
@@ -257,6 +333,7 @@ func (hc *TopNCollector) finalizeResults(r index.IndexReader) error { | |||
return err | |||
} | |||
} | |||
doc.Complete(nil) | |||
return nil | |||
}) | |||
@@ -288,5 +365,5 @@ func (hc *TopNCollector) FacetResults() search.FacetResults { | |||
if hc.facetsBuilder != nil { | |||
return hc.facetsBuilder.Results() | |||
} | |||
return search.FacetResults{} | |||
return nil | |||
} |
@@ -17,8 +17,18 @@ package search | |||
import ( | |||
"encoding/json" | |||
"fmt" | |||
"reflect" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var reflectStaticSizeExplanation int | |||
func init() { | |||
var e Explanation | |||
reflectStaticSizeExplanation = int(reflect.TypeOf(e).Size()) | |||
} | |||
type Explanation struct { | |||
Value float64 `json:"value"` | |||
Message string `json:"message"` | |||
@@ -32,3 +42,14 @@ func (expl *Explanation) String() string { | |||
} | |||
return string(js) | |||
} | |||
func (expl *Explanation) Size() int { | |||
sizeInBytes := reflectStaticSizeExplanation + size.SizeOfPtr + | |||
len(expl.Message) | |||
for _, entry := range expl.Children { | |||
sizeInBytes += entry.Size() | |||
} | |||
return sizeInBytes | |||
} |
@@ -15,13 +15,25 @@ | |||
package facet | |||
import ( | |||
"reflect" | |||
"sort" | |||
"time" | |||
"github.com/blevesearch/bleve/numeric" | |||
"github.com/blevesearch/bleve/search" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var reflectStaticSizeDateTimeFacetBuilder int | |||
var reflectStaticSizedateTimeRange int | |||
func init() { | |||
var dtfb DateTimeFacetBuilder | |||
reflectStaticSizeDateTimeFacetBuilder = int(reflect.TypeOf(dtfb).Size()) | |||
var dtr dateTimeRange | |||
reflectStaticSizedateTimeRange = int(reflect.TypeOf(dtr).Size()) | |||
} | |||
type dateTimeRange struct { | |||
start time.Time | |||
end time.Time | |||
@@ -46,6 +58,23 @@ func NewDateTimeFacetBuilder(field string, size int) *DateTimeFacetBuilder { | |||
} | |||
} | |||
func (fb *DateTimeFacetBuilder) Size() int { | |||
sizeInBytes := reflectStaticSizeDateTimeFacetBuilder + size.SizeOfPtr + | |||
len(fb.field) | |||
for k, _ := range fb.termsCount { | |||
sizeInBytes += size.SizeOfString + len(k) + | |||
size.SizeOfInt | |||
} | |||
for k, _ := range fb.ranges { | |||
sizeInBytes += size.SizeOfString + len(k) + | |||
size.SizeOfPtr + reflectStaticSizedateTimeRange | |||
} | |||
return sizeInBytes | |||
} | |||
func (fb *DateTimeFacetBuilder) AddRange(name string, start, end time.Time) { | |||
r := dateTimeRange{ | |||
start: start, |
@@ -15,12 +15,24 @@ | |||
package facet | |||
import ( | |||
"reflect" | |||
"sort" | |||
"github.com/blevesearch/bleve/numeric" | |||
"github.com/blevesearch/bleve/search" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var reflectStaticSizeNumericFacetBuilder int | |||
var reflectStaticSizenumericRange int | |||
func init() { | |||
var nfb NumericFacetBuilder | |||
reflectStaticSizeNumericFacetBuilder = int(reflect.TypeOf(nfb).Size()) | |||
var nr numericRange | |||
reflectStaticSizenumericRange = int(reflect.TypeOf(nr).Size()) | |||
} | |||
type numericRange struct { | |||
min *float64 | |||
max *float64 | |||
@@ -45,6 +57,23 @@ func NewNumericFacetBuilder(field string, size int) *NumericFacetBuilder { | |||
} | |||
} | |||
func (fb *NumericFacetBuilder) Size() int { | |||
sizeInBytes := reflectStaticSizeNumericFacetBuilder + size.SizeOfPtr + | |||
len(fb.field) | |||
for k, _ := range fb.termsCount { | |||
sizeInBytes += size.SizeOfString + len(k) + | |||
size.SizeOfInt | |||
} | |||
for k, _ := range fb.ranges { | |||
sizeInBytes += size.SizeOfString + len(k) + | |||
size.SizeOfPtr + reflectStaticSizenumericRange | |||
} | |||
return sizeInBytes | |||
} | |||
func (fb *NumericFacetBuilder) AddRange(name string, min, max *float64) { | |||
r := numericRange{ | |||
min: min, |
@@ -15,11 +15,20 @@ | |||
package facet | |||
import ( | |||
"reflect" | |||
"sort" | |||
"github.com/blevesearch/bleve/search" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var reflectStaticSizeTermsFacetBuilder int | |||
func init() { | |||
var tfb TermsFacetBuilder | |||
reflectStaticSizeTermsFacetBuilder = int(reflect.TypeOf(tfb).Size()) | |||
} | |||
type TermsFacetBuilder struct { | |||
size int | |||
field string | |||
@@ -37,6 +46,18 @@ func NewTermsFacetBuilder(field string, size int) *TermsFacetBuilder { | |||
} | |||
} | |||
func (fb *TermsFacetBuilder) Size() int { | |||
sizeInBytes := reflectStaticSizeTermsFacetBuilder + size.SizeOfPtr + | |||
len(fb.field) | |||
for k, _ := range fb.termsCount { | |||
sizeInBytes += size.SizeOfString + len(k) + | |||
size.SizeOfInt | |||
} | |||
return sizeInBytes | |||
} | |||
func (fb *TermsFacetBuilder) Field() string { | |||
return fb.field | |||
} |
@@ -15,11 +15,32 @@ | |||
package search | |||
import ( | |||
"reflect" | |||
"sort" | |||
"github.com/blevesearch/bleve/index" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var reflectStaticSizeFacetsBuilder int | |||
var reflectStaticSizeFacetResult int | |||
var reflectStaticSizeTermFacet int | |||
var reflectStaticSizeNumericRangeFacet int | |||
var reflectStaticSizeDateRangeFacet int | |||
func init() { | |||
var fb FacetsBuilder | |||
reflectStaticSizeFacetsBuilder = int(reflect.TypeOf(fb).Size()) | |||
var fr FacetResult | |||
reflectStaticSizeFacetResult = int(reflect.TypeOf(fr).Size()) | |||
var tf TermFacet | |||
reflectStaticSizeTermFacet = int(reflect.TypeOf(tf).Size()) | |||
var nrf NumericRangeFacet | |||
reflectStaticSizeNumericRangeFacet = int(reflect.TypeOf(nrf).Size()) | |||
var drf DateRangeFacet | |||
reflectStaticSizeDateRangeFacet = int(reflect.TypeOf(drf).Size()) | |||
} | |||
type FacetBuilder interface { | |||
StartDoc() | |||
UpdateVisitor(field string, term []byte) | |||
@@ -27,23 +48,40 @@ type FacetBuilder interface { | |||
Result() *FacetResult | |||
Field() string | |||
Size() int | |||
} | |||
type FacetsBuilder struct { | |||
indexReader index.IndexReader | |||
facets map[string]FacetBuilder | |||
facetNames []string | |||
facets []FacetBuilder | |||
fields []string | |||
} | |||
func NewFacetsBuilder(indexReader index.IndexReader) *FacetsBuilder { | |||
return &FacetsBuilder{ | |||
indexReader: indexReader, | |||
facets: make(map[string]FacetBuilder, 0), | |||
} | |||
} | |||
func (fb *FacetsBuilder) Size() int { | |||
sizeInBytes := reflectStaticSizeFacetsBuilder + size.SizeOfPtr | |||
for k, v := range fb.facets { | |||
sizeInBytes += size.SizeOfString + v.Size() + len(fb.facetNames[k]) | |||
} | |||
for _, entry := range fb.fields { | |||
sizeInBytes += size.SizeOfString + len(entry) | |||
} | |||
return sizeInBytes | |||
} | |||
func (fb *FacetsBuilder) Add(name string, facetBuilder FacetBuilder) { | |||
fb.facets[name] = facetBuilder | |||
fb.facetNames = append(fb.facetNames, name) | |||
fb.facets = append(fb.facets, facetBuilder) | |||
fb.fields = append(fb.fields, facetBuilder.Field()) | |||
} | |||
@@ -213,6 +251,14 @@ type FacetResult struct { | |||
DateRanges DateRangeFacets `json:"date_ranges,omitempty"` | |||
} | |||
func (fr *FacetResult) Size() int { | |||
return reflectStaticSizeFacetResult + size.SizeOfPtr + | |||
len(fr.Field) + | |||
len(fr.Terms)*(reflectStaticSizeTermFacet+size.SizeOfPtr) + | |||
len(fr.NumericRanges)*(reflectStaticSizeNumericRangeFacet+size.SizeOfPtr) + | |||
len(fr.DateRanges)*(reflectStaticSizeDateRangeFacet+size.SizeOfPtr) | |||
} | |||
func (fr *FacetResult) Merge(other *FacetResult) { | |||
fr.Total += other.Total | |||
fr.Missing += other.Missing | |||
@@ -287,9 +333,9 @@ func (fr FacetResults) Fixup(name string, size int) { | |||
func (fb *FacetsBuilder) Results() FacetResults { | |||
fr := make(FacetResults) | |||
for facetName, facetBuilder := range fb.facets { | |||
for i, facetBuilder := range fb.facets { | |||
facetResult := facetBuilder.Result() | |||
fr[facetName] = facetResult | |||
fr[fb.facetNames[i]] = facetResult | |||
} | |||
return fr | |||
} |
@@ -57,15 +57,24 @@ func LevenshteinDistance(a, b string) int { | |||
// in which case the first return val will be the max | |||
// and the second will be true, indicating max was exceeded | |||
func LevenshteinDistanceMax(a, b string, max int) (int, bool) { | |||
v, wasMax, _ := LevenshteinDistanceMaxReuseSlice(a, b, max, nil) | |||
return v, wasMax | |||
} | |||
func LevenshteinDistanceMaxReuseSlice(a, b string, max int, d []int) (int, bool, []int) { | |||
la := len(a) | |||
lb := len(b) | |||
ld := int(math.Abs(float64(la - lb))) | |||
if ld > max { | |||
return max, true | |||
return max, true, d | |||
} | |||
d := make([]int, la+1) | |||
if cap(d) < la+1 { | |||
d = make([]int, la+1) | |||
} | |||
d = d[:la+1] | |||
var lastdiag, olddiag, temp int | |||
for i := 1; i <= la; i++ { | |||
@@ -98,8 +107,8 @@ func LevenshteinDistanceMax(a, b string, max int) (int, bool) { | |||
} | |||
// after each row if rowmin isn't less than max stop | |||
if rowmin > max { | |||
return max, true | |||
return max, true, d | |||
} | |||
} | |||
return d[la], false | |||
return d[la], false, d | |||
} |
@@ -14,6 +14,17 @@ | |||
package search | |||
import ( | |||
"reflect" | |||
) | |||
var reflectStaticSizeDocumentMatchPool int | |||
func init() { | |||
var dmp DocumentMatchPool | |||
reflectStaticSizeDocumentMatchPool = int(reflect.TypeOf(dmp).Size()) | |||
} | |||
// DocumentMatchPoolTooSmall is a callback function that can be executed | |||
// when the DocumentMatchPool does not have sufficient capacity | |||
// By default we just perform just-in-time allocation, but you could log |
@@ -70,9 +70,11 @@ func (q *ConjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, | |||
} | |||
ss = append(ss, sr) | |||
} | |||
if len(ss) < 1 { | |||
return searcher.NewMatchNoneSearcher(i) | |||
} | |||
return searcher.NewConjunctionSearcher(i, ss, options) | |||
} | |||
@@ -58,7 +58,8 @@ func (q *DisjunctionQuery) SetMin(m float64) { | |||
q.Min = m | |||
} | |||
func (q *DisjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) { | |||
func (q *DisjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, | |||
options search.SearcherOptions) (search.Searcher, error) { | |||
ss := make([]search.Searcher, 0, len(q.Disjuncts)) | |||
for _, disjunct := range q.Disjuncts { | |||
sr, err := disjunct.Searcher(i, m, options) | |||
@@ -76,9 +77,17 @@ func (q *DisjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, | |||
} | |||
ss = append(ss, sr) | |||
} | |||
if len(ss) < 1 { | |||
return searcher.NewMatchNoneSearcher(i) | |||
} else if len(ss) == 1 && int(q.Min) == ss[0].Min() { | |||
// apply optimization only if both conditions below are satisfied: | |||
// - disjunction searcher has only 1 child searcher | |||
// - parent searcher's min setting is equal to child searcher's min | |||
return ss[0], nil | |||
} | |||
return searcher.NewDisjunctionSearcher(i, ss, q.Min, options) | |||
} | |||
@@ -296,32 +296,28 @@ func expandQuery(m mapping.IndexMapping, query Query) (Query, error) { | |||
} | |||
expand = func(query Query) (Query, error) { | |||
switch query.(type) { | |||
switch q := query.(type) { | |||
case *QueryStringQuery: | |||
q := query.(*QueryStringQuery) | |||
parsed, err := parseQuerySyntax(q.Query) | |||
if err != nil { | |||
return nil, fmt.Errorf("could not parse '%s': %s", q.Query, err) | |||
} | |||
return expand(parsed) | |||
case *ConjunctionQuery: | |||
q := *query.(*ConjunctionQuery) | |||
children, err := expandSlice(q.Conjuncts) | |||
if err != nil { | |||
return nil, err | |||
} | |||
q.Conjuncts = children | |||
return &q, nil | |||
return q, nil | |||
case *DisjunctionQuery: | |||
q := *query.(*DisjunctionQuery) | |||
children, err := expandSlice(q.Disjuncts) | |||
if err != nil { | |||
return nil, err | |||
} | |||
q.Disjuncts = children | |||
return &q, nil | |||
return q, nil | |||
case *BooleanQuery: | |||
q := *query.(*BooleanQuery) | |||
var err error | |||
q.Must, err = expand(q.Must) | |||
if err != nil { | |||
@@ -335,7 +331,7 @@ func expandQuery(m mapping.IndexMapping, query Query) (Query, error) { | |||
if err != nil { | |||
return nil, err | |||
} | |||
return &q, nil | |||
return q, nil | |||
default: | |||
return query, nil | |||
} |
@@ -273,6 +273,7 @@ func inNumOrStrState(l *queryStringLex, next rune, eof bool) (lexState, bool) { | |||
// see where to go | |||
if !l.seenDot && next == '.' { | |||
// stay in this state | |||
l.seenDot = true | |||
l.buf += string(next) | |||
return inNumOrStrState, true | |||
} else if unicode.IsDigit(next) { |
@@ -15,7 +15,6 @@ | |||
package query | |||
import ( | |||
"regexp" | |||
"strings" | |||
"github.com/blevesearch/bleve/index" | |||
@@ -28,7 +27,6 @@ type RegexpQuery struct { | |||
Regexp string `json:"regexp"` | |||
FieldVal string `json:"field,omitempty"` | |||
BoostVal *Boost `json:"boost,omitempty"` | |||
compiled *regexp.Regexp | |||
} | |||
// NewRegexpQuery creates a new Query which finds | |||
@@ -64,33 +62,20 @@ func (q *RegexpQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, opti | |||
if q.FieldVal == "" { | |||
field = m.DefaultSearchField() | |||
} | |||
err := q.compile() | |||
if err != nil { | |||
return nil, err | |||
// require that pattern NOT be anchored to start and end of term. | |||
// do not attempt to remove trailing $, its presence is not | |||
// known to interfere with LiteralPrefix() the way ^ does | |||
// and removing $ introduces possible ambiguities with escaped \$, \\$, etc | |||
actualRegexp := q.Regexp | |||
if strings.HasPrefix(actualRegexp, "^") { | |||
actualRegexp = actualRegexp[1:] // remove leading ^ | |||
} | |||
return searcher.NewRegexpSearcher(i, q.compiled, field, q.BoostVal.Value(), options) | |||
return searcher.NewRegexpStringSearcher(i, actualRegexp, field, | |||
q.BoostVal.Value(), options) | |||
} | |||
func (q *RegexpQuery) Validate() error { | |||
return q.compile() | |||
} | |||
func (q *RegexpQuery) compile() error { | |||
if q.compiled == nil { | |||
// require that pattern NOT be anchored to start and end of term | |||
actualRegexp := q.Regexp | |||
if strings.HasPrefix(actualRegexp, "^") { | |||
actualRegexp = actualRegexp[1:] // remove leading ^ | |||
} | |||
// do not attempt to remove trailing $, it's presence is not | |||
// known to interfere with LiteralPrefix() the way ^ does | |||
// and removing $ introduces possible ambiguities with escaped \$, \\$, etc | |||
var err error | |||
q.compiled, err = regexp.Compile(actualRegexp) | |||
if err != nil { | |||
return err | |||
} | |||
} | |||
return nil | |||
return nil // real validation delayed until searcher constructor | |||
} |
@@ -15,7 +15,6 @@ | |||
package query | |||
import ( | |||
"regexp" | |||
"strings" | |||
"github.com/blevesearch/bleve/index" | |||
@@ -47,7 +46,6 @@ type WildcardQuery struct { | |||
Wildcard string `json:"wildcard"` | |||
FieldVal string `json:"field,omitempty"` | |||
BoostVal *Boost `json:"boost,omitempty"` | |||
compiled *regexp.Regexp | |||
} | |||
// NewWildcardQuery creates a new Query which finds | |||
@@ -83,24 +81,13 @@ func (q *WildcardQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, op | |||
if q.FieldVal == "" { | |||
field = m.DefaultSearchField() | |||
} | |||
if q.compiled == nil { | |||
var err error | |||
q.compiled, err = q.convertToRegexp() | |||
if err != nil { | |||
return nil, err | |||
} | |||
} | |||
return searcher.NewRegexpSearcher(i, q.compiled, field, q.BoostVal.Value(), options) | |||
} | |||
regexpString := wildcardRegexpReplacer.Replace(q.Wildcard) | |||
func (q *WildcardQuery) Validate() error { | |||
var err error | |||
q.compiled, err = q.convertToRegexp() | |||
return err | |||
return searcher.NewRegexpStringSearcher(i, regexpString, field, | |||
q.BoostVal.Value(), options) | |||
} | |||
func (q *WildcardQuery) convertToRegexp() (*regexp.Regexp, error) { | |||
regexpString := wildcardRegexpReplacer.Replace(q.Wildcard) | |||
return regexp.Compile(regexpString) | |||
func (q *WildcardQuery) Validate() error { | |||
return nil // real validation delayed until searcher constructor | |||
} |
@@ -15,13 +15,27 @@ | |||
package scorer | |||
import ( | |||
"reflect" | |||
"github.com/blevesearch/bleve/search" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var reflectStaticSizeConjunctionQueryScorer int | |||
func init() { | |||
var cqs ConjunctionQueryScorer | |||
reflectStaticSizeConjunctionQueryScorer = int(reflect.TypeOf(cqs).Size()) | |||
} | |||
type ConjunctionQueryScorer struct { | |||
options search.SearcherOptions | |||
} | |||
func (s *ConjunctionQueryScorer) Size() int { | |||
return reflectStaticSizeConjunctionQueryScorer + size.SizeOfPtr | |||
} | |||
func NewConjunctionQueryScorer(options search.SearcherOptions) *ConjunctionQueryScorer { | |||
return &ConjunctionQueryScorer{ | |||
options: options, | |||
@@ -35,15 +49,11 @@ func (s *ConjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [ | |||
childrenExplanations = make([]*search.Explanation, len(constituents)) | |||
} | |||
locations := []search.FieldTermLocationMap{} | |||
for i, docMatch := range constituents { | |||
sum += docMatch.Score | |||
if s.options.Explain { | |||
childrenExplanations[i] = docMatch.Expl | |||
} | |||
if docMatch.Locations != nil { | |||
locations = append(locations, docMatch.Locations) | |||
} | |||
} | |||
newScore := sum | |||
var newExpl *search.Explanation | |||
@@ -55,11 +65,8 @@ func (s *ConjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [ | |||
rv := constituents[0] | |||
rv.Score = newScore | |||
rv.Expl = newExpl | |||
if len(locations) == 1 { | |||
rv.Locations = locations[0] | |||
} else if len(locations) > 1 { | |||
rv.Locations = search.MergeLocations(locations) | |||
} | |||
rv.FieldTermLocations = search.MergeFieldTermLocations( | |||
rv.FieldTermLocations, constituents[1:]) | |||
return rv | |||
} |
@@ -16,11 +16,20 @@ package scorer | |||
import ( | |||
"fmt" | |||
"reflect" | |||
"github.com/blevesearch/bleve/index" | |||
"github.com/blevesearch/bleve/search" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var reflectStaticSizeConstantScorer int | |||
func init() { | |||
var cs ConstantScorer | |||
reflectStaticSizeConstantScorer = int(reflect.TypeOf(cs).Size()) | |||
} | |||
type ConstantScorer struct { | |||
constant float64 | |||
boost float64 | |||
@@ -30,6 +39,16 @@ type ConstantScorer struct { | |||
queryWeightExplanation *search.Explanation | |||
} | |||
func (s *ConstantScorer) Size() int { | |||
sizeInBytes := reflectStaticSizeConstantScorer + size.SizeOfPtr | |||
if s.queryWeightExplanation != nil { | |||
sizeInBytes += s.queryWeightExplanation.Size() | |||
} | |||
return sizeInBytes | |||
} | |||
func NewConstantScorer(constant float64, boost float64, options search.SearcherOptions) *ConstantScorer { | |||
rv := ConstantScorer{ | |||
options: options, |
@@ -16,14 +16,27 @@ package scorer | |||
import ( | |||
"fmt" | |||
"reflect" | |||
"github.com/blevesearch/bleve/search" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var reflectStaticSizeDisjunctionQueryScorer int | |||
func init() { | |||
var dqs DisjunctionQueryScorer | |||
reflectStaticSizeDisjunctionQueryScorer = int(reflect.TypeOf(dqs).Size()) | |||
} | |||
type DisjunctionQueryScorer struct { | |||
options search.SearcherOptions | |||
} | |||
func (s *DisjunctionQueryScorer) Size() int { | |||
return reflectStaticSizeDisjunctionQueryScorer + size.SizeOfPtr | |||
} | |||
func NewDisjunctionQueryScorer(options search.SearcherOptions) *DisjunctionQueryScorer { | |||
return &DisjunctionQueryScorer{ | |||
options: options, | |||
@@ -37,15 +50,11 @@ func (s *DisjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [ | |||
childrenExplanations = make([]*search.Explanation, len(constituents)) | |||
} | |||
var locations []search.FieldTermLocationMap | |||
for i, docMatch := range constituents { | |||
sum += docMatch.Score | |||
if s.options.Explain { | |||
childrenExplanations[i] = docMatch.Expl | |||
} | |||
if docMatch.Locations != nil { | |||
locations = append(locations, docMatch.Locations) | |||
} | |||
} | |||
var rawExpl *search.Explanation | |||
@@ -67,11 +76,8 @@ func (s *DisjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [ | |||
rv := constituents[0] | |||
rv.Score = newScore | |||
rv.Expl = newExpl | |||
if len(locations) == 1 { | |||
rv.Locations = locations[0] | |||
} else if len(locations) > 1 { | |||
rv.Locations = search.MergeLocations(locations) | |||
} | |||
rv.FieldTermLocations = search.MergeFieldTermLocations( | |||
rv.FieldTermLocations, constituents[1:]) | |||
return rv | |||
} |
@@ -17,13 +17,22 @@ package scorer | |||
import ( | |||
"fmt" | |||
"math" | |||
"reflect" | |||
"github.com/blevesearch/bleve/index" | |||
"github.com/blevesearch/bleve/search" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var reflectStaticSizeTermQueryScorer int | |||
func init() { | |||
var tqs TermQueryScorer | |||
reflectStaticSizeTermQueryScorer = int(reflect.TypeOf(tqs).Size()) | |||
} | |||
type TermQueryScorer struct { | |||
queryTerm []byte | |||
queryTerm string | |||
queryField string | |||
queryBoost float64 | |||
docTerm uint64 | |||
@@ -36,9 +45,24 @@ type TermQueryScorer struct { | |||
queryWeightExplanation *search.Explanation | |||
} | |||
func (s *TermQueryScorer) Size() int { | |||
sizeInBytes := reflectStaticSizeTermQueryScorer + size.SizeOfPtr + | |||
len(s.queryTerm) + len(s.queryField) | |||
if s.idfExplanation != nil { | |||
sizeInBytes += s.idfExplanation.Size() | |||
} | |||
if s.queryWeightExplanation != nil { | |||
sizeInBytes += s.queryWeightExplanation.Size() | |||
} | |||
return sizeInBytes | |||
} | |||
func NewTermQueryScorer(queryTerm []byte, queryField string, queryBoost float64, docTotal, docTerm uint64, options search.SearcherOptions) *TermQueryScorer { | |||
rv := TermQueryScorer{ | |||
queryTerm: queryTerm, | |||
queryTerm: string(queryTerm), | |||
queryField: queryField, | |||
queryBoost: queryBoost, | |||
docTerm: docTerm, | |||
@@ -82,7 +106,7 @@ func (s *TermQueryScorer) SetQueryNorm(qnorm float64) { | |||
} | |||
s.queryWeightExplanation = &search.Explanation{ | |||
Value: s.queryWeight, | |||
Message: fmt.Sprintf("queryWeight(%s:%s^%f), product of:", s.queryField, string(s.queryTerm), s.queryBoost), | |||
Message: fmt.Sprintf("queryWeight(%s:%s^%f), product of:", s.queryField, s.queryTerm, s.queryBoost), | |||
Children: childrenExplanations, | |||
} | |||
} | |||
@@ -104,7 +128,7 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term | |||
childrenExplanations := make([]*search.Explanation, 3) | |||
childrenExplanations[0] = &search.Explanation{ | |||
Value: tf, | |||
Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, string(s.queryTerm), termMatch.Freq), | |||
Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, s.queryTerm, termMatch.Freq), | |||
} | |||
childrenExplanations[1] = &search.Explanation{ | |||
Value: termMatch.Norm, | |||
@@ -113,7 +137,7 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term | |||
childrenExplanations[2] = s.idfExplanation | |||
scoreExplanation = &search.Explanation{ | |||
Value: score, | |||
Message: fmt.Sprintf("fieldWeight(%s:%s in %s), product of:", s.queryField, string(s.queryTerm), termMatch.ID), | |||
Message: fmt.Sprintf("fieldWeight(%s:%s in %s), product of:", s.queryField, s.queryTerm, termMatch.ID), | |||
Children: childrenExplanations, | |||
} | |||
} | |||
@@ -127,7 +151,7 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term | |||
childExplanations[1] = scoreExplanation | |||
scoreExplanation = &search.Explanation{ | |||
Value: score, | |||
Message: fmt.Sprintf("weight(%s:%s^%f in %s), product of:", s.queryField, string(s.queryTerm), s.queryBoost, termMatch.ID), | |||
Message: fmt.Sprintf("weight(%s:%s^%f in %s), product of:", s.queryField, s.queryTerm, s.queryBoost, termMatch.ID), | |||
Children: childExplanations, | |||
} | |||
} | |||
@@ -140,41 +164,31 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term | |||
rv.Expl = scoreExplanation | |||
} | |||
if termMatch.Vectors != nil && len(termMatch.Vectors) > 0 { | |||
locs := make([]search.Location, len(termMatch.Vectors)) | |||
locsUsed := 0 | |||
totalPositions := 0 | |||
for _, v := range termMatch.Vectors { | |||
totalPositions += len(v.ArrayPositions) | |||
if len(termMatch.Vectors) > 0 { | |||
if cap(rv.FieldTermLocations) < len(termMatch.Vectors) { | |||
rv.FieldTermLocations = make([]search.FieldTermLocation, 0, len(termMatch.Vectors)) | |||
} | |||
positions := make(search.ArrayPositions, totalPositions) | |||
positionsUsed := 0 | |||
rv.Locations = make(search.FieldTermLocationMap) | |||
for _, v := range termMatch.Vectors { | |||
tlm := rv.Locations[v.Field] | |||
if tlm == nil { | |||
tlm = make(search.TermLocationMap) | |||
rv.Locations[v.Field] = tlm | |||
} | |||
loc := &locs[locsUsed] | |||
locsUsed++ | |||
loc.Pos = v.Pos | |||
loc.Start = v.Start | |||
loc.End = v.End | |||
var ap search.ArrayPositions | |||
if len(v.ArrayPositions) > 0 { | |||
loc.ArrayPositions = positions[positionsUsed : positionsUsed+len(v.ArrayPositions)] | |||
for i, ap := range v.ArrayPositions { | |||
loc.ArrayPositions[i] = ap | |||
n := len(rv.FieldTermLocations) | |||
if n < cap(rv.FieldTermLocations) { // reuse ap slice if available | |||
ap = rv.FieldTermLocations[:n+1][n].Location.ArrayPositions[:0] | |||
} | |||
positionsUsed += len(v.ArrayPositions) | |||
ap = append(ap, v.ArrayPositions...) | |||
} | |||
tlm[string(s.queryTerm)] = append(tlm[string(s.queryTerm)], loc) | |||
rv.FieldTermLocations = | |||
append(rv.FieldTermLocations, search.FieldTermLocation{ | |||
Field: v.Field, | |||
Term: s.queryTerm, | |||
Location: search.Location{ | |||
Pos: v.Pos, | |||
Start: v.Start, | |||
End: v.End, | |||
ArrayPositions: ap, | |||
}, | |||
}) | |||
} | |||
} | |||
@@ -16,11 +16,25 @@ package search | |||
import ( | |||
"fmt" | |||
"reflect" | |||
"github.com/blevesearch/bleve/document" | |||
"github.com/blevesearch/bleve/index" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var reflectStaticSizeDocumentMatch int | |||
var reflectStaticSizeSearchContext int | |||
var reflectStaticSizeLocation int | |||
func init() { | |||
var dm DocumentMatch | |||
reflectStaticSizeDocumentMatch = int(reflect.TypeOf(dm).Size()) | |||
var sc SearchContext | |||
reflectStaticSizeSearchContext = int(reflect.TypeOf(sc).Size()) | |||
var l Location | |||
reflectStaticSizeLocation = int(reflect.TypeOf(l).Size()) | |||
} | |||
type ArrayPositions []uint64 | |||
func (ap ArrayPositions) Equals(other ArrayPositions) bool { | |||
@@ -47,6 +61,11 @@ type Location struct { | |||
ArrayPositions ArrayPositions `json:"array_positions"` | |||
} | |||
func (l *Location) Size() int { | |||
return reflectStaticSizeLocation + size.SizeOfPtr + | |||
len(l.ArrayPositions)*size.SizeOfUint64 | |||
} | |||
type Locations []*Location | |||
type TermLocationMap map[string]Locations | |||
@@ -57,6 +76,12 @@ func (t TermLocationMap) AddLocation(term string, location *Location) { | |||
type FieldTermLocationMap map[string]TermLocationMap | |||
type FieldTermLocation struct { | |||
Field string | |||
Term string | |||
Location Location | |||
} | |||
type FieldFragmentMap map[string][]string | |||
type DocumentMatch struct { | |||
@@ -74,11 +99,14 @@ type DocumentMatch struct { | |||
// fields as float64s and date fields as time.RFC3339 formatted strings. | |||
Fields map[string]interface{} `json:"fields,omitempty"` | |||
// if we load the document for this hit, remember it so we dont load again | |||
Document *document.Document `json:"-"` | |||
// used to maintain natural index order | |||
HitNumber uint64 `json:"-"` | |||
// used to temporarily hold field term location information during | |||
// search processing in an efficient, recycle-friendly manner, to | |||
// be later incorporated into the Locations map when search | |||
// results are completed | |||
FieldTermLocations []FieldTermLocation `json:"-"` | |||
} | |||
func (dm *DocumentMatch) AddFieldValue(name string, value interface{}) { | |||
@@ -108,15 +136,116 @@ func (dm *DocumentMatch) Reset() *DocumentMatch { | |||
indexInternalID := dm.IndexInternalID | |||
// remember the []interface{} used for sort | |||
sort := dm.Sort | |||
// remember the FieldTermLocations backing array | |||
ftls := dm.FieldTermLocations | |||
for i := range ftls { // recycle the ArrayPositions of each location | |||
ftls[i].Location.ArrayPositions = ftls[i].Location.ArrayPositions[:0] | |||
} | |||
// idiom to copy over from empty DocumentMatch (0 allocations) | |||
*dm = DocumentMatch{} | |||
// reuse the []byte already allocated (and reset len to 0) | |||
dm.IndexInternalID = indexInternalID[:0] | |||
// reuse the []interface{} already allocated (and reset len to 0) | |||
dm.Sort = sort[:0] | |||
// reuse the FieldTermLocations already allocated (and reset len to 0) | |||
dm.FieldTermLocations = ftls[:0] | |||
return dm | |||
} | |||
func (dm *DocumentMatch) Size() int { | |||
sizeInBytes := reflectStaticSizeDocumentMatch + size.SizeOfPtr + | |||
len(dm.Index) + | |||
len(dm.ID) + | |||
len(dm.IndexInternalID) | |||
if dm.Expl != nil { | |||
sizeInBytes += dm.Expl.Size() | |||
} | |||
for k, v := range dm.Locations { | |||
sizeInBytes += size.SizeOfString + len(k) | |||
for k1, v1 := range v { | |||
sizeInBytes += size.SizeOfString + len(k1) + | |||
size.SizeOfSlice | |||
for _, entry := range v1 { | |||
sizeInBytes += entry.Size() | |||
} | |||
} | |||
} | |||
for k, v := range dm.Fragments { | |||
sizeInBytes += size.SizeOfString + len(k) + | |||
size.SizeOfSlice | |||
for _, entry := range v { | |||
sizeInBytes += size.SizeOfString + len(entry) | |||
} | |||
} | |||
for _, entry := range dm.Sort { | |||
sizeInBytes += size.SizeOfString + len(entry) | |||
} | |||
for k, _ := range dm.Fields { | |||
sizeInBytes += size.SizeOfString + len(k) + | |||
size.SizeOfPtr | |||
} | |||
return sizeInBytes | |||
} | |||
// Complete performs final preparation & transformation of the | |||
// DocumentMatch at the end of search processing, also allowing the | |||
// caller to provide an optional preallocated locations slice | |||
func (dm *DocumentMatch) Complete(prealloc []Location) []Location { | |||
// transform the FieldTermLocations slice into the Locations map | |||
nlocs := len(dm.FieldTermLocations) | |||
if nlocs > 0 { | |||
if cap(prealloc) < nlocs { | |||
prealloc = make([]Location, nlocs) | |||
} | |||
prealloc = prealloc[:nlocs] | |||
var lastField string | |||
var tlm TermLocationMap | |||
for i, ftl := range dm.FieldTermLocations { | |||
if lastField != ftl.Field { | |||
lastField = ftl.Field | |||
if dm.Locations == nil { | |||
dm.Locations = make(FieldTermLocationMap) | |||
} | |||
tlm = dm.Locations[ftl.Field] | |||
if tlm == nil { | |||
tlm = make(TermLocationMap) | |||
dm.Locations[ftl.Field] = tlm | |||
} | |||
} | |||
loc := &prealloc[i] | |||
*loc = ftl.Location | |||
if len(loc.ArrayPositions) > 0 { // copy | |||
loc.ArrayPositions = append(ArrayPositions(nil), loc.ArrayPositions...) | |||
} | |||
tlm[ftl.Term] = append(tlm[ftl.Term], loc) | |||
dm.FieldTermLocations[i] = FieldTermLocation{ // recycle | |||
Location: Location{ | |||
ArrayPositions: ftl.Location.ArrayPositions[:0], | |||
}, | |||
} | |||
} | |||
} | |||
dm.FieldTermLocations = dm.FieldTermLocations[:0] // recycle | |||
return prealloc | |||
} | |||
func (dm *DocumentMatch) String() string { | |||
return fmt.Sprintf("[%s-%f]", string(dm.IndexInternalID), dm.Score) | |||
} | |||
@@ -135,6 +264,7 @@ type Searcher interface { | |||
SetQueryNorm(float64) | |||
Count() uint64 | |||
Min() int | |||
Size() int | |||
DocumentMatchPoolSize() int | |||
} | |||
@@ -142,9 +272,26 @@ type Searcher interface { | |||
type SearcherOptions struct { | |||
Explain bool | |||
IncludeTermVectors bool | |||
Score string | |||
} | |||
// SearchContext represents the context around a single search | |||
type SearchContext struct { | |||
DocumentMatchPool *DocumentMatchPool | |||
Collector Collector | |||
} | |||
func (sc *SearchContext) Size() int { | |||
sizeInBytes := reflectStaticSizeSearchContext + size.SizeOfPtr + | |||
reflectStaticSizeDocumentMatchPool + size.SizeOfPtr | |||
if sc.DocumentMatchPool != nil { | |||
for _, entry := range sc.DocumentMatchPool.avail { | |||
if entry != nil { | |||
sizeInBytes += entry.Size() | |||
} | |||
} | |||
} | |||
return sizeInBytes | |||
} |
@@ -16,12 +16,21 @@ package searcher | |||
import ( | |||
"math" | |||
"reflect" | |||
"github.com/blevesearch/bleve/index" | |||
"github.com/blevesearch/bleve/search" | |||
"github.com/blevesearch/bleve/search/scorer" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var reflectStaticSizeBooleanSearcher int | |||
func init() { | |||
var bs BooleanSearcher | |||
reflectStaticSizeBooleanSearcher = int(reflect.TypeOf(bs).Size()) | |||
} | |||
type BooleanSearcher struct { | |||
indexReader index.IndexReader | |||
mustSearcher search.Searcher | |||
@@ -52,6 +61,32 @@ func NewBooleanSearcher(indexReader index.IndexReader, mustSearcher search.Searc | |||
return &rv, nil | |||
} | |||
func (s *BooleanSearcher) Size() int { | |||
sizeInBytes := reflectStaticSizeBooleanSearcher + size.SizeOfPtr | |||
if s.mustSearcher != nil { | |||
sizeInBytes += s.mustSearcher.Size() | |||
} | |||
if s.shouldSearcher != nil { | |||
sizeInBytes += s.shouldSearcher.Size() | |||
} | |||
if s.mustNotSearcher != nil { | |||
sizeInBytes += s.mustNotSearcher.Size() | |||
} | |||
sizeInBytes += s.scorer.Size() | |||
for _, entry := range s.matches { | |||
if entry != nil { | |||
sizeInBytes += entry.Size() | |||
} | |||
} | |||
return sizeInBytes | |||
} | |||
func (s *BooleanSearcher) computeQueryNorm() { | |||
// first calculate sum of squared weights | |||
sumOfSquaredWeights := 0.0 | |||
@@ -284,6 +319,7 @@ func (s *BooleanSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch | |||
return nil, err | |||
} | |||
} | |||
return rv, nil | |||
} | |||
@@ -296,41 +332,52 @@ func (s *BooleanSearcher) Advance(ctx *search.SearchContext, ID index.IndexInter | |||
} | |||
} | |||
var err error | |||
if s.mustSearcher != nil { | |||
if s.currMust != nil { | |||
ctx.DocumentMatchPool.Put(s.currMust) | |||
} | |||
s.currMust, err = s.mustSearcher.Advance(ctx, ID) | |||
if err != nil { | |||
return nil, err | |||
} | |||
} | |||
if s.shouldSearcher != nil { | |||
if s.currShould != nil { | |||
ctx.DocumentMatchPool.Put(s.currShould) | |||
} | |||
s.currShould, err = s.shouldSearcher.Advance(ctx, ID) | |||
if err != nil { | |||
return nil, err | |||
// Advance the searchers only if the currentID cursor is trailing the lookup ID, | |||
// additionally if the mustNotSearcher has been initialized, ensure that the | |||
// cursor used to track the mustNotSearcher (currMustNot, which isn't tracked by | |||
// currentID) is trailing the lookup ID as well - for in the case where currentID | |||
// is nil and currMustNot is already at or ahead of the lookup ID, we MUST NOT | |||
// advance the currentID or the currMustNot cursors. | |||
if (s.currentID == nil || s.currentID.Compare(ID) < 0) && | |||
(s.currMustNot == nil || s.currMustNot.IndexInternalID.Compare(ID) < 0) { | |||
var err error | |||
if s.mustSearcher != nil { | |||
if s.currMust != nil { | |||
ctx.DocumentMatchPool.Put(s.currMust) | |||
} | |||
s.currMust, err = s.mustSearcher.Advance(ctx, ID) | |||
if err != nil { | |||
return nil, err | |||
} | |||
} | |||
} | |||
if s.mustNotSearcher != nil { | |||
if s.currMustNot != nil { | |||
ctx.DocumentMatchPool.Put(s.currMustNot) | |||
if s.shouldSearcher != nil { | |||
if s.currShould != nil { | |||
ctx.DocumentMatchPool.Put(s.currShould) | |||
} | |||
s.currShould, err = s.shouldSearcher.Advance(ctx, ID) | |||
if err != nil { | |||
return nil, err | |||
} | |||
} | |||
s.currMustNot, err = s.mustNotSearcher.Advance(ctx, ID) | |||
if err != nil { | |||
return nil, err | |||
if s.mustNotSearcher != nil { | |||
if s.currMustNot != nil { | |||
ctx.DocumentMatchPool.Put(s.currMustNot) | |||
} | |||
s.currMustNot, err = s.mustNotSearcher.Advance(ctx, ID) | |||
if err != nil { | |||
return nil, err | |||
} | |||
} | |||
} | |||
if s.mustSearcher != nil && s.currMust != nil { | |||
s.currentID = s.currMust.IndexInternalID | |||
} else if s.mustSearcher == nil && s.currShould != nil { | |||
s.currentID = s.currShould.IndexInternalID | |||
} else { | |||
s.currentID = nil | |||
if s.mustSearcher != nil && s.currMust != nil { | |||
s.currentID = s.currMust.IndexInternalID | |||
} else if s.mustSearcher == nil && s.currShould != nil { | |||
s.currentID = s.currShould.IndexInternalID | |||
} else { | |||
s.currentID = nil | |||
} | |||
} | |||
return s.Next(ctx) |
@@ -16,13 +16,22 @@ package searcher | |||
import ( | |||
"math" | |||
"reflect" | |||
"sort" | |||
"github.com/blevesearch/bleve/index" | |||
"github.com/blevesearch/bleve/search" | |||
"github.com/blevesearch/bleve/search/scorer" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var reflectStaticSizeConjunctionSearcher int | |||
func init() { | |||
var cs ConjunctionSearcher | |||
reflectStaticSizeConjunctionSearcher = int(reflect.TypeOf(cs).Size()) | |||
} | |||
type ConjunctionSearcher struct { | |||
indexReader index.IndexReader | |||
searchers OrderedSearcherList | |||
@@ -34,14 +43,27 @@ type ConjunctionSearcher struct { | |||
options search.SearcherOptions | |||
} | |||
func NewConjunctionSearcher(indexReader index.IndexReader, qsearchers []search.Searcher, options search.SearcherOptions) (*ConjunctionSearcher, error) { | |||
// build the downstream searchers | |||
func NewConjunctionSearcher(indexReader index.IndexReader, | |||
qsearchers []search.Searcher, options search.SearcherOptions) ( | |||
search.Searcher, error) { | |||
// build the sorted downstream searchers | |||
searchers := make(OrderedSearcherList, len(qsearchers)) | |||
for i, searcher := range qsearchers { | |||
searchers[i] = searcher | |||
} | |||
// sort the searchers | |||
sort.Sort(searchers) | |||
// attempt the "unadorned" conjunction optimization only when we | |||
// do not need extra information like freq-norm's or term vectors | |||
if len(searchers) > 1 && | |||
options.Score == "none" && !options.IncludeTermVectors { | |||
rv, err := optimizeCompositeSearcher("conjunction:unadorned", | |||
indexReader, searchers, options) | |||
if err != nil || rv != nil { | |||
return rv, err | |||
} | |||
} | |||
// build our searcher | |||
rv := ConjunctionSearcher{ | |||
indexReader: indexReader, | |||
@@ -51,9 +73,36 @@ func NewConjunctionSearcher(indexReader index.IndexReader, qsearchers []search.S | |||
scorer: scorer.NewConjunctionQueryScorer(options), | |||
} | |||
rv.computeQueryNorm() | |||
// attempt push-down conjunction optimization when there's >1 searchers | |||
if len(searchers) > 1 { | |||
rv, err := optimizeCompositeSearcher("conjunction", | |||
indexReader, searchers, options) | |||
if err != nil || rv != nil { | |||
return rv, err | |||
} | |||
} | |||
return &rv, nil | |||
} | |||
func (s *ConjunctionSearcher) Size() int { | |||
sizeInBytes := reflectStaticSizeConjunctionSearcher + size.SizeOfPtr + | |||
s.scorer.Size() | |||
for _, entry := range s.searchers { | |||
sizeInBytes += entry.Size() | |||
} | |||
for _, entry := range s.currs { | |||
if entry != nil { | |||
sizeInBytes += entry.Size() | |||
} | |||
} | |||
return sizeInBytes | |||
} | |||
func (s *ConjunctionSearcher) computeQueryNorm() { | |||
// first calculate sum of squared weights | |||
sumOfSquaredWeights := 0.0 | |||
@@ -108,7 +157,7 @@ func (s *ConjunctionSearcher) Next(ctx *search.SearchContext) (*search.DocumentM | |||
var rv *search.DocumentMatch | |||
var err error | |||
OUTER: | |||
for s.currs[s.maxIDIdx] != nil { | |||
for s.maxIDIdx < len(s.currs) && s.currs[s.maxIDIdx] != nil { | |||
maxID := s.currs[s.maxIDIdx].IndexInternalID | |||
i := 0 |
@@ -1,4 +1,4 @@ | |||
// Copyright (c) 2014 Couchbase, Inc. | |||
// Copyright (c) 2018 Couchbase, Inc. | |||
// | |||
// Licensed under the Apache License, Version 2.0 (the "License"); | |||
// you may not use this file except in compliance with the License. | |||
@@ -16,12 +16,9 @@ package searcher | |||
import ( | |||
"fmt" | |||
"math" | |||
"sort" | |||
"github.com/blevesearch/bleve/index" | |||
"github.com/blevesearch/bleve/search" | |||
"github.com/blevesearch/bleve/search/scorer" | |||
) | |||
// DisjunctionMaxClauseCount is a compile time setting that applications can | |||
@@ -29,246 +26,84 @@ import ( | |||
// error instead of exeucting searches when the size exceeds this value. | |||
var DisjunctionMaxClauseCount = 0 | |||
type DisjunctionSearcher struct { | |||
indexReader index.IndexReader | |||
searchers OrderedSearcherList | |||
numSearchers int | |||
queryNorm float64 | |||
currs []*search.DocumentMatch | |||
scorer *scorer.DisjunctionQueryScorer | |||
min int | |||
matching []*search.DocumentMatch | |||
matchingIdxs []int | |||
initialized bool | |||
} | |||
func tooManyClauses(count int) bool { | |||
if DisjunctionMaxClauseCount != 0 && count > DisjunctionMaxClauseCount { | |||
return true | |||
} | |||
return false | |||
} | |||
func tooManyClausesErr() error { | |||
return fmt.Errorf("TooManyClauses[maxClauseCount is set to %d]", | |||
DisjunctionMaxClauseCount) | |||
} | |||
// DisjunctionHeapTakeover is a compile time setting that applications can | |||
// adjust to control when the DisjunctionSearcher will switch from a simple | |||
// slice implementation to a heap implementation. | |||
var DisjunctionHeapTakeover = 10 | |||
func NewDisjunctionSearcher(indexReader index.IndexReader, | |||
qsearchers []search.Searcher, min float64, options search.SearcherOptions) ( | |||
*DisjunctionSearcher, error) { | |||
return newDisjunctionSearcher(indexReader, qsearchers, min, options, | |||
true) | |||
search.Searcher, error) { | |||
return newDisjunctionSearcher(indexReader, qsearchers, min, options, true) | |||
} | |||
func newDisjunctionSearcher(indexReader index.IndexReader, | |||
qsearchers []search.Searcher, min float64, options search.SearcherOptions, | |||
limit bool) ( | |||
*DisjunctionSearcher, error) { | |||
if limit && tooManyClauses(len(qsearchers)) { | |||
return nil, tooManyClausesErr() | |||
} | |||
// build the downstream searchers | |||
searchers := make(OrderedSearcherList, len(qsearchers)) | |||
for i, searcher := range qsearchers { | |||
searchers[i] = searcher | |||
} | |||
// sort the searchers | |||
sort.Sort(sort.Reverse(searchers)) | |||
// build our searcher | |||
rv := DisjunctionSearcher{ | |||
indexReader: indexReader, | |||
searchers: searchers, | |||
numSearchers: len(searchers), | |||
currs: make([]*search.DocumentMatch, len(searchers)), | |||
scorer: scorer.NewDisjunctionQueryScorer(options), | |||
min: int(min), | |||
matching: make([]*search.DocumentMatch, len(searchers)), | |||
matchingIdxs: make([]int, len(searchers)), | |||
} | |||
rv.computeQueryNorm() | |||
return &rv, nil | |||
} | |||
func (s *DisjunctionSearcher) computeQueryNorm() { | |||
// first calculate sum of squared weights | |||
sumOfSquaredWeights := 0.0 | |||
for _, searcher := range s.searchers { | |||
sumOfSquaredWeights += searcher.Weight() | |||
} | |||
// now compute query norm from this | |||
s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights) | |||
// finally tell all the downstream searchers the norm | |||
for _, searcher := range s.searchers { | |||
searcher.SetQueryNorm(s.queryNorm) | |||
} | |||
} | |||
func (s *DisjunctionSearcher) initSearchers(ctx *search.SearchContext) error { | |||
var err error | |||
// get all searchers pointing at their first match | |||
for i, searcher := range s.searchers { | |||
if s.currs[i] != nil { | |||
ctx.DocumentMatchPool.Put(s.currs[i]) | |||
} | |||
s.currs[i], err = searcher.Next(ctx) | |||
if err != nil { | |||
return err | |||
limit bool) (search.Searcher, error) { | |||
// attempt the "unadorned" disjunction optimization only when we | |||
// do not need extra information like freq-norm's or term vectors | |||
// and the requested min is simple | |||
if len(qsearchers) > 1 && min <= 1 && | |||
options.Score == "none" && !options.IncludeTermVectors { | |||
rv, err := optimizeCompositeSearcher("disjunction:unadorned", | |||
indexReader, qsearchers, options) | |||
if err != nil || rv != nil { | |||
return rv, err | |||
} | |||
} | |||
err = s.updateMatches() | |||
if err != nil { | |||
return err | |||
if len(qsearchers) > DisjunctionHeapTakeover { | |||
return newDisjunctionHeapSearcher(indexReader, qsearchers, min, options, | |||
limit) | |||
} | |||
s.initialized = true | |||
return nil | |||
return newDisjunctionSliceSearcher(indexReader, qsearchers, min, options, | |||
limit) | |||
} | |||
func (s *DisjunctionSearcher) updateMatches() error { | |||
matching := s.matching[:0] | |||
matchingIdxs := s.matchingIdxs[:0] | |||
for i := 0; i < len(s.currs); i++ { | |||
curr := s.currs[i] | |||
if curr == nil { | |||
continue | |||
} | |||
if len(matching) > 0 { | |||
cmp := curr.IndexInternalID.Compare(matching[0].IndexInternalID) | |||
if cmp > 0 { | |||
continue | |||
} | |||
func optimizeCompositeSearcher(optimizationKind string, | |||
indexReader index.IndexReader, qsearchers []search.Searcher, | |||
options search.SearcherOptions) (search.Searcher, error) { | |||
var octx index.OptimizableContext | |||
if cmp < 0 { | |||
matching = matching[:0] | |||
matchingIdxs = matchingIdxs[:0] | |||
} | |||
for _, searcher := range qsearchers { | |||
o, ok := searcher.(index.Optimizable) | |||
if !ok { | |||
return nil, nil | |||
} | |||
matching = append(matching, curr) | |||
matchingIdxs = append(matchingIdxs, i) | |||
} | |||
s.matching = matching | |||
s.matchingIdxs = matchingIdxs | |||
return nil | |||
} | |||
func (s *DisjunctionSearcher) Weight() float64 { | |||
var rv float64 | |||
for _, searcher := range s.searchers { | |||
rv += searcher.Weight() | |||
} | |||
return rv | |||
} | |||
func (s *DisjunctionSearcher) SetQueryNorm(qnorm float64) { | |||
for _, searcher := range s.searchers { | |||
searcher.SetQueryNorm(qnorm) | |||
} | |||
} | |||
func (s *DisjunctionSearcher) Next(ctx *search.SearchContext) ( | |||
*search.DocumentMatch, error) { | |||
if !s.initialized { | |||
err := s.initSearchers(ctx) | |||
if err != nil { | |||
return nil, err | |||
} | |||
} | |||
var err error | |||
var rv *search.DocumentMatch | |||
found := false | |||
for !found && len(s.matching) > 0 { | |||
if len(s.matching) >= s.min { | |||
found = true | |||
// score this match | |||
rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers) | |||
} | |||
// invoke next on all the matching searchers | |||
for _, i := range s.matchingIdxs { | |||
searcher := s.searchers[i] | |||
if s.currs[i] != rv { | |||
ctx.DocumentMatchPool.Put(s.currs[i]) | |||
} | |||
s.currs[i], err = searcher.Next(ctx) | |||
if err != nil { | |||
return nil, err | |||
} | |||
} | |||
err = s.updateMatches() | |||
var err error | |||
octx, err = o.Optimize(optimizationKind, octx) | |||
if err != nil { | |||
return nil, err | |||
} | |||
} | |||
return rv, nil | |||
} | |||
func (s *DisjunctionSearcher) Advance(ctx *search.SearchContext, | |||
ID index.IndexInternalID) (*search.DocumentMatch, error) { | |||
if !s.initialized { | |||
err := s.initSearchers(ctx) | |||
if err != nil { | |||
return nil, err | |||
} | |||
} | |||
// get all searchers pointing at their first match | |||
var err error | |||
for i, searcher := range s.searchers { | |||
if s.currs[i] != nil { | |||
if s.currs[i].IndexInternalID.Compare(ID) >= 0 { | |||
continue | |||
} | |||
ctx.DocumentMatchPool.Put(s.currs[i]) | |||
} | |||
s.currs[i], err = searcher.Advance(ctx, ID) | |||
if err != nil { | |||
return nil, err | |||
if octx == nil { | |||
return nil, nil | |||
} | |||
} | |||
err = s.updateMatches() | |||
if err != nil { | |||
optimized, err := octx.Finish() | |||
if err != nil || optimized == nil { | |||
return nil, err | |||
} | |||
return s.Next(ctx) | |||
} | |||
func (s *DisjunctionSearcher) Count() uint64 { | |||
// for now return a worst case | |||
var sum uint64 | |||
for _, searcher := range s.searchers { | |||
sum += searcher.Count() | |||
tfr, ok := optimized.(index.TermFieldReader) | |||
if !ok { | |||
return nil, nil | |||
} | |||
return sum | |||
} | |||
func (s *DisjunctionSearcher) Close() (rv error) { | |||
for _, searcher := range s.searchers { | |||
err := searcher.Close() | |||
if err != nil && rv == nil { | |||
rv = err | |||
} | |||
} | |||
return rv | |||
return newTermSearcherFromReader(indexReader, tfr, | |||
[]byte(optimizationKind), "*", 1.0, options) | |||
} | |||
func (s *DisjunctionSearcher) Min() int { | |||
return s.min | |||
func tooManyClauses(count int) bool { | |||
if DisjunctionMaxClauseCount != 0 && count > DisjunctionMaxClauseCount { | |||
return true | |||
} | |||
return false | |||
} | |||
func (s *DisjunctionSearcher) DocumentMatchPoolSize() int { | |||
rv := len(s.currs) | |||
for _, s := range s.searchers { | |||
rv += s.DocumentMatchPoolSize() | |||
} | |||
return rv | |||
// tooManyClausesErr builds the error returned when a disjunction is
// constructed with more clauses than DisjunctionMaxClauseCount allows.
func tooManyClausesErr(count int) error {
	return fmt.Errorf("TooManyClauses[%d > maxClauseCount, which is set to %d]",
		count, DisjunctionMaxClauseCount)
}
@@ -0,0 +1,343 @@ | |||
// Copyright (c) 2018 Couchbase, Inc. | |||
// | |||
// Licensed under the Apache License, Version 2.0 (the "License"); | |||
// you may not use this file except in compliance with the License. | |||
// You may obtain a copy of the License at | |||
// | |||
// http://www.apache.org/licenses/LICENSE-2.0 | |||
// | |||
// Unless required by applicable law or agreed to in writing, software | |||
// distributed under the License is distributed on an "AS IS" BASIS, | |||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
// See the License for the specific language governing permissions and | |||
// limitations under the License. | |||
package searcher | |||
import ( | |||
"bytes" | |||
"container/heap" | |||
"math" | |||
"reflect" | |||
"github.com/blevesearch/bleve/index" | |||
"github.com/blevesearch/bleve/search" | |||
"github.com/blevesearch/bleve/search/scorer" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var reflectStaticSizeDisjunctionHeapSearcher int | |||
var reflectStaticSizeSearcherCurr int | |||
// init precomputes the reflect-derived static sizes used by Size so
// that memory accounting avoids per-call reflection.
func init() {
	var dhs DisjunctionHeapSearcher
	reflectStaticSizeDisjunctionHeapSearcher = int(reflect.TypeOf(dhs).Size())
	var sc SearcherCurr
	reflectStaticSizeSearcherCurr = int(reflect.TypeOf(sc).Size())
}
// SearcherCurr pairs a child searcher with its current, not yet
// consumed DocumentMatch; these are the elements stored on the heap.
type SearcherCurr struct {
	searcher search.Searcher
	curr     *search.DocumentMatch
}
// DisjunctionHeapSearcher implements disjunction (OR) over an
// arbitrary number of child searchers, tracking the children in a
// min-heap ordered by their current internal doc ID.
type DisjunctionHeapSearcher struct {
	indexReader index.IndexReader

	numSearchers int
	scorer       *scorer.DisjunctionQueryScorer
	min          int // minimum number of children that must match
	queryNorm    float64
	initialized  bool // lazily set by initSearchers
	searchers    []search.Searcher
	heap         []*SearcherCurr

	// matching/matchingCurrs hold the children whose current match
	// shares the lowest doc ID; refreshed by updateMatches
	matching      []*search.DocumentMatch
	matchingCurrs []*SearcherCurr
}
func newDisjunctionHeapSearcher(indexReader index.IndexReader, | |||
searchers []search.Searcher, min float64, options search.SearcherOptions, | |||
limit bool) ( | |||
*DisjunctionHeapSearcher, error) { | |||
if limit && tooManyClauses(len(searchers)) { | |||
return nil, tooManyClausesErr(len(searchers)) | |||
} | |||
// build our searcher | |||
rv := DisjunctionHeapSearcher{ | |||
indexReader: indexReader, | |||
searchers: searchers, | |||
numSearchers: len(searchers), | |||
scorer: scorer.NewDisjunctionQueryScorer(options), | |||
min: int(min), | |||
matching: make([]*search.DocumentMatch, len(searchers)), | |||
matchingCurrs: make([]*SearcherCurr, len(searchers)), | |||
heap: make([]*SearcherCurr, 0, len(searchers)), | |||
} | |||
rv.computeQueryNorm() | |||
return &rv, nil | |||
} | |||
func (s *DisjunctionHeapSearcher) Size() int { | |||
sizeInBytes := reflectStaticSizeDisjunctionHeapSearcher + size.SizeOfPtr + | |||
s.scorer.Size() | |||
for _, entry := range s.searchers { | |||
sizeInBytes += entry.Size() | |||
} | |||
for _, entry := range s.matching { | |||
if entry != nil { | |||
sizeInBytes += entry.Size() | |||
} | |||
} | |||
// for matchingCurrs and heap, just use static size * len | |||
// since searchers and document matches already counted above | |||
sizeInBytes += len(s.matchingCurrs) * reflectStaticSizeSearcherCurr | |||
sizeInBytes += len(s.heap) * reflectStaticSizeSearcherCurr | |||
return sizeInBytes | |||
} | |||
func (s *DisjunctionHeapSearcher) computeQueryNorm() { | |||
// first calculate sum of squared weights | |||
sumOfSquaredWeights := 0.0 | |||
for _, searcher := range s.searchers { | |||
sumOfSquaredWeights += searcher.Weight() | |||
} | |||
// now compute query norm from this | |||
s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights) | |||
// finally tell all the downstream searchers the norm | |||
for _, searcher := range s.searchers { | |||
searcher.SetQueryNorm(s.queryNorm) | |||
} | |||
} | |||
// initSearchers primes every child searcher with its first match,
// pushes the non-exhausted ones onto the heap, and computes the
// initial matching set. Called lazily from Next/Advance.
func (s *DisjunctionHeapSearcher) initSearchers(ctx *search.SearchContext) error {
	// alloc a single block of SearcherCurrs, one heap entry per child
	block := make([]SearcherCurr, len(s.searchers))
	// get all searchers pointing at their first match
	for i, searcher := range s.searchers {
		curr, err := searcher.Next(ctx)
		if err != nil {
			return err
		}
		// children with no matches at all are simply never pushed
		if curr != nil {
			block[i].searcher = searcher
			block[i].curr = curr
			heap.Push(s, &block[i])
		}
	}
	err := s.updateMatches()
	if err != nil {
		return err
	}
	s.initialized = true
	return nil
}
func (s *DisjunctionHeapSearcher) updateMatches() error { | |||
matching := s.matching[:0] | |||
matchingCurrs := s.matchingCurrs[:0] | |||
if len(s.heap) > 0 { | |||
// top of the heap is our next hit | |||
next := heap.Pop(s).(*SearcherCurr) | |||
matching = append(matching, next.curr) | |||
matchingCurrs = append(matchingCurrs, next) | |||
// now as long as top of heap matches, keep popping | |||
for len(s.heap) > 0 && bytes.Compare(next.curr.IndexInternalID, s.heap[0].curr.IndexInternalID) == 0 { | |||
next = heap.Pop(s).(*SearcherCurr) | |||
matching = append(matching, next.curr) | |||
matchingCurrs = append(matchingCurrs, next) | |||
} | |||
} | |||
s.matching = matching | |||
s.matchingCurrs = matchingCurrs | |||
return nil | |||
} | |||
func (s *DisjunctionHeapSearcher) Weight() float64 { | |||
var rv float64 | |||
for _, searcher := range s.searchers { | |||
rv += searcher.Weight() | |||
} | |||
return rv | |||
} | |||
func (s *DisjunctionHeapSearcher) SetQueryNorm(qnorm float64) { | |||
for _, searcher := range s.searchers { | |||
searcher.SetQueryNorm(qnorm) | |||
} | |||
} | |||
// Next returns the next document matched by at least s.min children,
// or nil when the disjunction is exhausted.
func (s *DisjunctionHeapSearcher) Next(ctx *search.SearchContext) (
	*search.DocumentMatch, error) {
	if !s.initialized {
		err := s.initSearchers(ctx)
		if err != nil {
			return nil, err
		}
	}
	var rv *search.DocumentMatch
	found := false
	for !found && len(s.matching) > 0 {
		if len(s.matching) >= s.min {
			found = true
			// score this match
			rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers)
		}
		// invoke next on all the matching searchers
		for _, matchingCurr := range s.matchingCurrs {
			if matchingCurr.curr != rv {
				// recycle matches that were not handed out as rv
				ctx.DocumentMatchPool.Put(matchingCurr.curr)
			}
			curr, err := matchingCurr.searcher.Next(ctx)
			if err != nil {
				return nil, err
			}
			if curr != nil {
				// child still has matches; put it back on the heap
				matchingCurr.curr = curr
				heap.Push(s, matchingCurr)
			}
		}
		err := s.updateMatches()
		if err != nil {
			return nil, err
		}
	}
	return rv, nil
}
// Advance moves the disjunction to the first match with internal doc
// ID >= ID and returns it (or nil when exhausted).
func (s *DisjunctionHeapSearcher) Advance(ctx *search.SearchContext,
	ID index.IndexInternalID) (*search.DocumentMatch, error) {
	if !s.initialized {
		err := s.initSearchers(ctx)
		if err != nil {
			return nil, err
		}
	}
	// if there is anything in matching, toss it back onto the heap
	for _, matchingCurr := range s.matchingCurrs {
		heap.Push(s, matchingCurr)
	}
	s.matching = s.matching[:0]
	s.matchingCurrs = s.matchingCurrs[:0]
	// find all searchers that actually need to be advanced
	// advance them, using s.matchingCurrs as temp storage
	for len(s.heap) > 0 && bytes.Compare(s.heap[0].curr.IndexInternalID, ID) < 0 {
		searcherCurr := heap.Pop(s).(*SearcherCurr)
		// the old current match is before ID, recycle it
		ctx.DocumentMatchPool.Put(searcherCurr.curr)
		curr, err := searcherCurr.searcher.Advance(ctx, ID)
		if err != nil {
			return nil, err
		}
		// exhausted children are dropped from the heap entirely
		if curr != nil {
			searcherCurr.curr = curr
			s.matchingCurrs = append(s.matchingCurrs, searcherCurr)
		}
	}
	// now all of the searchers that we advanced have to be pushed back
	for _, matchingCurr := range s.matchingCurrs {
		heap.Push(s, matchingCurr)
	}
	// reset our temp space
	s.matchingCurrs = s.matchingCurrs[:0]
	err := s.updateMatches()
	if err != nil {
		return nil, err
	}
	return s.Next(ctx)
}
func (s *DisjunctionHeapSearcher) Count() uint64 { | |||
// for now return a worst case | |||
var sum uint64 | |||
for _, searcher := range s.searchers { | |||
sum += searcher.Count() | |||
} | |||
return sum | |||
} | |||
func (s *DisjunctionHeapSearcher) Close() (rv error) { | |||
for _, searcher := range s.searchers { | |||
err := searcher.Close() | |||
if err != nil && rv == nil { | |||
rv = err | |||
} | |||
} | |||
return rv | |||
} | |||
// Min returns the minimum number of child searchers that must match
// for a document to be returned.
func (s *DisjunctionHeapSearcher) Min() int {
	return s.min
}
func (s *DisjunctionHeapSearcher) DocumentMatchPoolSize() int { | |||
rv := len(s.searchers) | |||
for _, s := range s.searchers { | |||
rv += s.DocumentMatchPoolSize() | |||
} | |||
return rv | |||
} | |||
// a disjunction searcher implements the index.Optimizable interface | |||
// but only activates on an edge case where the disjunction is a | |||
// wrapper around a single Optimizable child searcher | |||
func (s *DisjunctionHeapSearcher) Optimize(kind string, octx index.OptimizableContext) ( | |||
index.OptimizableContext, error) { | |||
if len(s.searchers) == 1 { | |||
o, ok := s.searchers[0].(index.Optimizable) | |||
if ok { | |||
return o.Optimize(kind, octx) | |||
} | |||
} | |||
return octx, nil | |||
} | |||
// heap impl — DisjunctionHeapSearcher itself satisfies heap.Interface
// over its heap slice, ordered by current internal doc ID.
func (s *DisjunctionHeapSearcher) Len() int { return len(s.heap) }
func (s *DisjunctionHeapSearcher) Less(i, j int) bool { | |||
if s.heap[i].curr == nil { | |||
return true | |||
} else if s.heap[j].curr == nil { | |||
return false | |||
} | |||
return bytes.Compare(s.heap[i].curr.IndexInternalID, s.heap[j].curr.IndexInternalID) < 0 | |||
} | |||
// Swap exchanges two heap entries; used only via container/heap.
func (s *DisjunctionHeapSearcher) Swap(i, j int) {
	s.heap[i], s.heap[j] = s.heap[j], s.heap[i]
}
// Push appends an entry to the heap slice; used only via container/heap.
func (s *DisjunctionHeapSearcher) Push(x interface{}) {
	s.heap = append(s.heap, x.(*SearcherCurr))
}
func (s *DisjunctionHeapSearcher) Pop() interface{} { | |||
old := s.heap | |||
n := len(old) | |||
x := old[n-1] | |||
s.heap = old[0 : n-1] | |||
return x | |||
} |
@@ -0,0 +1,298 @@ | |||
// Copyright (c) 2018 Couchbase, Inc. | |||
// | |||
// Licensed under the Apache License, Version 2.0 (the "License"); | |||
// you may not use this file except in compliance with the License. | |||
// You may obtain a copy of the License at | |||
// | |||
// http://www.apache.org/licenses/LICENSE-2.0 | |||
// | |||
// Unless required by applicable law or agreed to in writing, software | |||
// distributed under the License is distributed on an "AS IS" BASIS, | |||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
// See the License for the specific language governing permissions and | |||
// limitations under the License. | |||
package searcher | |||
import ( | |||
"math" | |||
"reflect" | |||
"sort" | |||
"github.com/blevesearch/bleve/index" | |||
"github.com/blevesearch/bleve/search" | |||
"github.com/blevesearch/bleve/search/scorer" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var reflectStaticSizeDisjunctionSliceSearcher int | |||
// init precomputes the reflect-derived static size of
// DisjunctionSliceSearcher used by Size, avoiding per-call reflection.
func init() {
	var ds DisjunctionSliceSearcher
	reflectStaticSizeDisjunctionSliceSearcher = int(reflect.TypeOf(ds).Size())
}
// DisjunctionSliceSearcher implements disjunction (OR) over an
// arbitrary number of child searchers, tracking each child's current
// match in a parallel slice (currs) rather than a heap.
type DisjunctionSliceSearcher struct {
	indexReader  index.IndexReader
	searchers    OrderedSearcherList
	numSearchers int
	queryNorm    float64
	currs        []*search.DocumentMatch // current match per child; nil = exhausted
	scorer       *scorer.DisjunctionQueryScorer
	min          int // minimum number of children that must match

	// matching/matchingIdxs hold the matches (and their child indexes)
	// sharing the lowest doc ID; refreshed by updateMatches
	matching     []*search.DocumentMatch
	matchingIdxs []int
	initialized  bool // lazily set by initSearchers
}
func newDisjunctionSliceSearcher(indexReader index.IndexReader, | |||
qsearchers []search.Searcher, min float64, options search.SearcherOptions, | |||
limit bool) ( | |||
*DisjunctionSliceSearcher, error) { | |||
if limit && tooManyClauses(len(qsearchers)) { | |||
return nil, tooManyClausesErr(len(qsearchers)) | |||
} | |||
// build the downstream searchers | |||
searchers := make(OrderedSearcherList, len(qsearchers)) | |||
for i, searcher := range qsearchers { | |||
searchers[i] = searcher | |||
} | |||
// sort the searchers | |||
sort.Sort(sort.Reverse(searchers)) | |||
// build our searcher | |||
rv := DisjunctionSliceSearcher{ | |||
indexReader: indexReader, | |||
searchers: searchers, | |||
numSearchers: len(searchers), | |||
currs: make([]*search.DocumentMatch, len(searchers)), | |||
scorer: scorer.NewDisjunctionQueryScorer(options), | |||
min: int(min), | |||
matching: make([]*search.DocumentMatch, len(searchers)), | |||
matchingIdxs: make([]int, len(searchers)), | |||
} | |||
rv.computeQueryNorm() | |||
return &rv, nil | |||
} | |||
func (s *DisjunctionSliceSearcher) Size() int { | |||
sizeInBytes := reflectStaticSizeDisjunctionSliceSearcher + size.SizeOfPtr + | |||
s.scorer.Size() | |||
for _, entry := range s.searchers { | |||
sizeInBytes += entry.Size() | |||
} | |||
for _, entry := range s.currs { | |||
if entry != nil { | |||
sizeInBytes += entry.Size() | |||
} | |||
} | |||
for _, entry := range s.matching { | |||
if entry != nil { | |||
sizeInBytes += entry.Size() | |||
} | |||
} | |||
sizeInBytes += len(s.matchingIdxs) * size.SizeOfInt | |||
return sizeInBytes | |||
} | |||
func (s *DisjunctionSliceSearcher) computeQueryNorm() { | |||
// first calculate sum of squared weights | |||
sumOfSquaredWeights := 0.0 | |||
for _, searcher := range s.searchers { | |||
sumOfSquaredWeights += searcher.Weight() | |||
} | |||
// now compute query norm from this | |||
s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights) | |||
// finally tell all the downstream searchers the norm | |||
for _, searcher := range s.searchers { | |||
searcher.SetQueryNorm(s.queryNorm) | |||
} | |||
} | |||
func (s *DisjunctionSliceSearcher) initSearchers(ctx *search.SearchContext) error { | |||
var err error | |||
// get all searchers pointing at their first match | |||
for i, searcher := range s.searchers { | |||
if s.currs[i] != nil { | |||
ctx.DocumentMatchPool.Put(s.currs[i]) | |||
} | |||
s.currs[i], err = searcher.Next(ctx) | |||
if err != nil { | |||
return err | |||
} | |||
} | |||
err = s.updateMatches() | |||
if err != nil { | |||
return err | |||
} | |||
s.initialized = true | |||
return nil | |||
} | |||
// updateMatches rebuilds s.matching/s.matchingIdxs with every current
// match that shares the lowest internal doc ID across all children.
// The existing backing arrays are reused, so no allocation per call.
func (s *DisjunctionSliceSearcher) updateMatches() error {
	matching := s.matching[:0]
	matchingIdxs := s.matchingIdxs[:0]
	for i := 0; i < len(s.currs); i++ {
		curr := s.currs[i]
		if curr == nil {
			// this child is exhausted
			continue
		}
		if len(matching) > 0 {
			cmp := curr.IndexInternalID.Compare(matching[0].IndexInternalID)
			if cmp > 0 {
				// curr is after the lowest ID seen so far; skip it
				continue
			}
			if cmp < 0 {
				// curr is a new lowest ID; discard what was gathered
				matching = matching[:0]
				matchingIdxs = matchingIdxs[:0]
			}
		}
		matching = append(matching, curr)
		matchingIdxs = append(matchingIdxs, i)
	}
	s.matching = matching
	s.matchingIdxs = matchingIdxs
	return nil
}
func (s *DisjunctionSliceSearcher) Weight() float64 { | |||
var rv float64 | |||
for _, searcher := range s.searchers { | |||
rv += searcher.Weight() | |||
} | |||
return rv | |||
} | |||
func (s *DisjunctionSliceSearcher) SetQueryNorm(qnorm float64) { | |||
for _, searcher := range s.searchers { | |||
searcher.SetQueryNorm(qnorm) | |||
} | |||
} | |||
// Next returns the next document matched by at least s.min children,
// or nil when the disjunction is exhausted.
func (s *DisjunctionSliceSearcher) Next(ctx *search.SearchContext) (
	*search.DocumentMatch, error) {
	if !s.initialized {
		err := s.initSearchers(ctx)
		if err != nil {
			return nil, err
		}
	}
	var err error
	var rv *search.DocumentMatch
	found := false
	for !found && len(s.matching) > 0 {
		if len(s.matching) >= s.min {
			found = true
			// score this match
			rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers)
		}
		// invoke next on all the matching searchers
		for _, i := range s.matchingIdxs {
			searcher := s.searchers[i]
			if s.currs[i] != rv {
				// recycle matches that were not handed out as rv
				ctx.DocumentMatchPool.Put(s.currs[i])
			}
			s.currs[i], err = searcher.Next(ctx)
			if err != nil {
				return nil, err
			}
		}
		err = s.updateMatches()
		if err != nil {
			return nil, err
		}
	}
	return rv, nil
}
// Advance moves the disjunction to the first match with internal doc
// ID >= ID and returns it (or nil when exhausted).
func (s *DisjunctionSliceSearcher) Advance(ctx *search.SearchContext,
	ID index.IndexInternalID) (*search.DocumentMatch, error) {
	if !s.initialized {
		err := s.initSearchers(ctx)
		if err != nil {
			return nil, err
		}
	}
	// get all searchers pointing at their first match
	var err error
	for i, searcher := range s.searchers {
		if s.currs[i] != nil {
			if s.currs[i].IndexInternalID.Compare(ID) >= 0 {
				// already at or past the target; no need to advance
				continue
			}
			// the old current match is before ID, recycle it
			ctx.DocumentMatchPool.Put(s.currs[i])
		}
		s.currs[i], err = searcher.Advance(ctx, ID)
		if err != nil {
			return nil, err
		}
	}
	err = s.updateMatches()
	if err != nil {
		return nil, err
	}
	return s.Next(ctx)
}
func (s *DisjunctionSliceSearcher) Count() uint64 { | |||
// for now return a worst case | |||
var sum uint64 | |||
for _, searcher := range s.searchers { | |||
sum += searcher.Count() | |||
} | |||
return sum | |||
} | |||
func (s *DisjunctionSliceSearcher) Close() (rv error) { | |||
for _, searcher := range s.searchers { | |||
err := searcher.Close() | |||
if err != nil && rv == nil { | |||
rv = err | |||
} | |||
} | |||
return rv | |||
} | |||
// Min returns the minimum number of child searchers that must match
// for a document to be returned.
func (s *DisjunctionSliceSearcher) Min() int {
	return s.min
}
func (s *DisjunctionSliceSearcher) DocumentMatchPoolSize() int { | |||
rv := len(s.currs) | |||
for _, s := range s.searchers { | |||
rv += s.DocumentMatchPoolSize() | |||
} | |||
return rv | |||
} | |||
// a disjunction searcher implements the index.Optimizable interface | |||
// but only activates on an edge case where the disjunction is a | |||
// wrapper around a single Optimizable child searcher | |||
func (s *DisjunctionSliceSearcher) Optimize(kind string, octx index.OptimizableContext) ( | |||
index.OptimizableContext, error) { | |||
if len(s.searchers) == 1 { | |||
o, ok := s.searchers[0].(index.Optimizable) | |||
if ok { | |||
return o.Optimize(kind, octx) | |||
} | |||
} | |||
return octx, nil | |||
} |
@@ -15,11 +15,21 @@ | |||
package searcher | |||
import ( | |||
"reflect" | |||
"github.com/blevesearch/bleve/index" | |||
"github.com/blevesearch/bleve/search" | |||
"github.com/blevesearch/bleve/search/scorer" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var reflectStaticSizeDocIDSearcher int | |||
// init precomputes the reflect-derived static size of DocIDSearcher
// used by Size, avoiding per-call reflection.
func init() {
	var ds DocIDSearcher
	reflectStaticSizeDocIDSearcher = int(reflect.TypeOf(ds).Size())
}
// DocIDSearcher returns documents matching a predefined set of identifiers. | |||
type DocIDSearcher struct { | |||
reader index.DocIDReader | |||
@@ -42,6 +52,12 @@ func NewDocIDSearcher(indexReader index.IndexReader, ids []string, boost float64 | |||
}, nil | |||
} | |||
// Size estimates the heap memory footprint of this searcher in bytes.
func (s *DocIDSearcher) Size() int {
	return reflectStaticSizeDocIDSearcher + size.SizeOfPtr +
		s.reader.Size() +
		s.scorer.Size()
}
// Count reports the stored count as uint64 (presumably the number of
// candidate doc IDs fixed at construction — TODO confirm against
// NewDocIDSearcher, which is outside this view).
func (s *DocIDSearcher) Count() uint64 {
	return uint64(s.count)
}
@@ -15,10 +15,20 @@ | |||
package searcher | |||
import ( | |||
"reflect" | |||
"github.com/blevesearch/bleve/index" | |||
"github.com/blevesearch/bleve/search" | |||
"github.com/blevesearch/bleve/size" | |||
) | |||
var reflectStaticSizeFilteringSearcher int | |||
// init precomputes the reflect-derived static size of
// FilteringSearcher used by Size, avoiding per-call reflection.
func init() {
	var fs FilteringSearcher
	reflectStaticSizeFilteringSearcher = int(reflect.TypeOf(fs).Size())
}
// FilterFunc defines a function which can filter documents | |||
// returning true means keep the document | |||
// returning false means do not keep the document | |||
@@ -38,6 +48,11 @@ func NewFilteringSearcher(s search.Searcher, filter FilterFunc) *FilteringSearch | |||
} | |||
} | |||
// Size estimates the heap memory footprint of this searcher in bytes,
// including the wrapped child searcher.
func (f *FilteringSearcher) Size() int {
	return reflectStaticSizeFilteringSearcher + size.SizeOfPtr +
		f.child.Size()
}
func (f *FilteringSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) { | |||
next, err := f.child.Next(ctx) | |||
for next != nil && err == nil { |