
new.go

// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package zap

import (
	"bytes"
	"encoding/binary"
	"math"
	"sort"
	"sync"

	"github.com/RoaringBitmap/roaring"
	index "github.com/blevesearch/bleve_index_api"
	segment "github.com/blevesearch/scorch_segment_api/v2"
	"github.com/blevesearch/vellum"
	"github.com/golang/snappy"
)

var NewSegmentBufferNumResultsBump int = 100
var NewSegmentBufferNumResultsFactor float64 = 1.0
var NewSegmentBufferAvgBytesPerDocFactor float64 = 1.0

// ValidateDocFields can be set by applications to perform additional checks
// on fields in a document being added to a new segment; by default it does
// nothing.
// This API is experimental and may be removed at any time.
var ValidateDocFields = func(field index.Field) error {
	return nil
}

var defaultChunkFactor uint32 = 1024

// New creates an in-memory zap-encoded SegmentBase from a set of Documents.
func (z *ZapPlugin) New(results []index.Document) (
	segment.Segment, uint64, error) {
	return z.newWithChunkFactor(results, defaultChunkFactor)
}
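
// A minimal usage sketch (hypothetical caller code; the docs value below is
// assumed to be a []index.Document produced by bleve's analysis phase and is
// not defined in this file):
//
//	var z ZapPlugin
//	seg, sizeInBytes, err := z.New(docs)
//	if err != nil {
//		// handle error
//	}
//	_ = seg // an in-memory segment.Segment; sizeInBytes is the encoded size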

func (*ZapPlugin) newWithChunkFactor(results []index.Document,
	chunkFactor uint32) (segment.Segment, uint64, error) {
	s := interimPool.Get().(*interim)

	var br bytes.Buffer
	if s.lastNumDocs > 0 {
		// use previous results to initialize the buf with an estimated
		// size, but note that the interim instance comes from a
		// global interimPool, so multiple scorch instances indexing
		// different docs can lead to low quality estimates
		estimateAvgBytesPerDoc := int(float64(s.lastOutSize/s.lastNumDocs) *
			NewSegmentBufferAvgBytesPerDocFactor)
		estimateNumResults := int(float64(len(results)+NewSegmentBufferNumResultsBump) *
			NewSegmentBufferNumResultsFactor)
		br.Grow(estimateAvgBytesPerDoc * estimateNumResults)
	}

	s.results = results
	s.chunkFactor = chunkFactor
	s.w = NewCountHashWriter(&br)

	storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets,
		err := s.convert()
	if err != nil {
		return nil, uint64(0), err
	}

	sb, err := InitSegmentBase(br.Bytes(), s.w.Sum32(), chunkFactor,
		s.FieldsMap, s.FieldsInv, uint64(len(results)),
		storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets)

	if err == nil && s.reset() == nil {
		s.lastNumDocs = len(results)
		s.lastOutSize = len(br.Bytes())
		interimPool.Put(s)
	}

	return sb, uint64(len(br.Bytes())), err
}
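
// A worked example of the Grow estimate above (illustrative numbers only):
// if the previous build wrote lastOutSize = 1,000,000 bytes for
// lastNumDocs = 1000 docs, estimateAvgBytesPerDoc is ~1000; with
// len(results) = 900 and the default bump of 100, estimateNumResults is
// 1000, so the buffer is pre-grown to ~1 MB before any encoding happens,
// avoiding repeated reallocation as the segment is written.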

var interimPool = sync.Pool{New: func() interface{} { return &interim{} }}

// interim holds temporary working data used while converting from
// analysis results to a zap-encoded segment
type interim struct {
	results []index.Document

	chunkFactor uint32

	w *CountHashWriter

	// FieldsMap adds 1 to field id to avoid zero value issues
	//  name -> field id + 1
	FieldsMap map[string]uint16

	// FieldsInv is the inverse of FieldsMap
	//  field id -> name
	FieldsInv []string

	// Term dictionaries for each field
	//  field id -> term -> postings list id + 1
	Dicts []map[string]uint64

	// Terms for each field, where terms are sorted ascending
	//  field id -> []term
	DictKeys [][]string

	// Fields whose IncludeDocValues is true
	//  field id -> bool
	IncludeDocValues []bool

	// postings id -> bitmap of docNums
	Postings []*roaring.Bitmap

	// postings id -> freq/norm's, one for each docNum in postings
	FreqNorms        [][]interimFreqNorm
	freqNormsBacking []interimFreqNorm

	// postings id -> locs, one for each freq
	Locs        [][]interimLoc
	locsBacking []interimLoc

	numTermsPerPostingsList []int // key is postings list id
	numLocsPerPostingsList  []int // key is postings list id

	builder    *vellum.Builder
	builderBuf bytes.Buffer

	metaBuf bytes.Buffer

	tmp0 []byte
	tmp1 []byte

	lastNumDocs int
	lastOutSize int
}
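
// Note on the "+ 1" convention above: FieldsMap and Dicts store id+1 so that
// a map lookup that misses (yielding Go's zero value, 0) is distinguishable
// from a hit on the first real id. For example, after getOrDefineField("_id")
// runs, FieldsMap["_id"] == 1, which denotes fieldID 0, while a lookup of an
// unknown name returns 0, meaning "not present".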

func (s *interim) reset() (err error) {
	s.results = nil
	s.chunkFactor = 0
	s.w = nil
	s.FieldsMap = nil
	s.FieldsInv = nil
	for i := range s.Dicts {
		s.Dicts[i] = nil
	}
	s.Dicts = s.Dicts[:0]
	for i := range s.DictKeys {
		s.DictKeys[i] = s.DictKeys[i][:0]
	}
	s.DictKeys = s.DictKeys[:0]
	for i := range s.IncludeDocValues {
		s.IncludeDocValues[i] = false
	}
	s.IncludeDocValues = s.IncludeDocValues[:0]
	for _, idn := range s.Postings {
		idn.Clear()
	}
	s.Postings = s.Postings[:0]
	s.FreqNorms = s.FreqNorms[:0]
	for i := range s.freqNormsBacking {
		s.freqNormsBacking[i] = interimFreqNorm{}
	}
	s.freqNormsBacking = s.freqNormsBacking[:0]
	s.Locs = s.Locs[:0]
	for i := range s.locsBacking {
		s.locsBacking[i] = interimLoc{}
	}
	s.locsBacking = s.locsBacking[:0]
	s.numTermsPerPostingsList = s.numTermsPerPostingsList[:0]
	s.numLocsPerPostingsList = s.numLocsPerPostingsList[:0]
	s.builderBuf.Reset()
	if s.builder != nil {
		err = s.builder.Reset(&s.builderBuf)
	}
	s.metaBuf.Reset()
	s.tmp0 = s.tmp0[:0]
	s.tmp1 = s.tmp1[:0]
	s.lastNumDocs = 0
	s.lastOutSize = 0

	return err
}
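
// reset is what makes the interimPool reuse safe: slices are truncated to
// length zero (keeping their backing capacity for the next build) rather
// than released, and the vellum builder is rebound to the emptied
// builderBuf. Only when reset returns nil does newWithChunkFactor put the
// instance back into the pool.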

func (s *interim) grabBuf(size int) []byte {
	buf := s.tmp0
	if cap(buf) < size {
		buf = make([]byte, size)
		s.tmp0 = buf
	}
	return buf[0:size]
}

type interimStoredField struct {
	vals      [][]byte
	typs      []byte
	arrayposs [][]uint64 // array positions
}

type interimFreqNorm struct {
	freq    uint64
	norm    float32
	numLocs int
}

type interimLoc struct {
	fieldID   uint16
	pos       uint64
	start     uint64
	end       uint64
	arrayposs []uint64
}

func (s *interim) convert() (uint64, uint64, uint64, []uint64, error) {
	s.FieldsMap = map[string]uint16{}

	s.getOrDefineField("_id") // _id field is fieldID 0

	for _, result := range s.results {
		result.VisitComposite(func(field index.CompositeField) {
			s.getOrDefineField(field.Name())
		})
		result.VisitFields(func(field index.Field) {
			s.getOrDefineField(field.Name())
		})
	}

	sort.Strings(s.FieldsInv[1:]) // keep _id as first field

	for fieldID, fieldName := range s.FieldsInv {
		s.FieldsMap[fieldName] = uint16(fieldID + 1)
	}

	if cap(s.IncludeDocValues) >= len(s.FieldsInv) {
		s.IncludeDocValues = s.IncludeDocValues[:len(s.FieldsInv)]
	} else {
		s.IncludeDocValues = make([]bool, len(s.FieldsInv))
	}

	s.prepareDicts()

	for _, dict := range s.DictKeys {
		sort.Strings(dict)
	}

	s.processDocuments()

	storedIndexOffset, err := s.writeStoredFields()
	if err != nil {
		return 0, 0, 0, nil, err
	}

	var fdvIndexOffset uint64
	var dictOffsets []uint64

	if len(s.results) > 0 {
		fdvIndexOffset, dictOffsets, err = s.writeDicts()
		if err != nil {
			return 0, 0, 0, nil, err
		}
	} else {
		dictOffsets = make([]uint64, len(s.FieldsInv))
	}

	fieldsIndexOffset, err := persistFields(s.FieldsInv, s.w, dictOffsets)
	if err != nil {
		return 0, 0, 0, nil, err
	}

	return storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, nil
}

func (s *interim) getOrDefineField(fieldName string) int {
	fieldIDPlus1, exists := s.FieldsMap[fieldName]
	if !exists {
		fieldIDPlus1 = uint16(len(s.FieldsInv) + 1)
		s.FieldsMap[fieldName] = fieldIDPlus1
		s.FieldsInv = append(s.FieldsInv, fieldName)

		s.Dicts = append(s.Dicts, make(map[string]uint64))

		n := len(s.DictKeys)
		if n < cap(s.DictKeys) {
			s.DictKeys = s.DictKeys[:n+1]
			s.DictKeys[n] = s.DictKeys[n][:0]
		} else {
			s.DictKeys = append(s.DictKeys, []string(nil))
		}
	}

	return int(fieldIDPlus1 - 1)
}
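
// For example (hypothetical field name): with only "_id" defined, the first
// call to getOrDefineField("title") appends "title" to FieldsInv, records
// FieldsMap["title"] = 2 (id+1), and returns fieldID 1; every later call
// with "title" takes the fast path and returns the same 1 without appending.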

// fill Dicts and DictKeys from analysis results
func (s *interim) prepareDicts() {
	var pidNext int

	var totTFs int
	var totLocs int

	visitField := func(field index.Field) {
		fieldID := uint16(s.getOrDefineField(field.Name()))

		dict := s.Dicts[fieldID]
		dictKeys := s.DictKeys[fieldID]

		tfs := field.AnalyzedTokenFrequencies()
		for term, tf := range tfs {
			pidPlus1, exists := dict[term]
			if !exists {
				pidNext++
				pidPlus1 = uint64(pidNext)

				dict[term] = pidPlus1
				dictKeys = append(dictKeys, term)

				s.numTermsPerPostingsList = append(s.numTermsPerPostingsList, 0)
				s.numLocsPerPostingsList = append(s.numLocsPerPostingsList, 0)
			}

			pid := pidPlus1 - 1

			s.numTermsPerPostingsList[pid] += 1
			s.numLocsPerPostingsList[pid] += len(tf.Locations)

			totLocs += len(tf.Locations)
		}

		totTFs += len(tfs)

		s.DictKeys[fieldID] = dictKeys
	}

	for _, result := range s.results {
		// walk each composite field
		result.VisitComposite(func(field index.CompositeField) {
			visitField(field)
		})

		// walk each field
		result.VisitFields(visitField)
	}

	numPostingsLists := pidNext

	if cap(s.Postings) >= numPostingsLists {
		s.Postings = s.Postings[:numPostingsLists]
	} else {
		postings := make([]*roaring.Bitmap, numPostingsLists)
		copy(postings, s.Postings[:cap(s.Postings)])
		for i := 0; i < numPostingsLists; i++ {
			if postings[i] == nil {
				postings[i] = roaring.New()
			}
		}
		s.Postings = postings
	}

	if cap(s.FreqNorms) >= numPostingsLists {
		s.FreqNorms = s.FreqNorms[:numPostingsLists]
	} else {
		s.FreqNorms = make([][]interimFreqNorm, numPostingsLists)
	}

	if cap(s.freqNormsBacking) >= totTFs {
		s.freqNormsBacking = s.freqNormsBacking[:totTFs]
	} else {
		s.freqNormsBacking = make([]interimFreqNorm, totTFs)
	}

	freqNormsBacking := s.freqNormsBacking
	for pid, numTerms := range s.numTermsPerPostingsList {
		s.FreqNorms[pid] = freqNormsBacking[0:0]
		freqNormsBacking = freqNormsBacking[numTerms:]
	}

	if cap(s.Locs) >= numPostingsLists {
		s.Locs = s.Locs[:numPostingsLists]
	} else {
		s.Locs = make([][]interimLoc, numPostingsLists)
	}

	if cap(s.locsBacking) >= totLocs {
		s.locsBacking = s.locsBacking[:totLocs]
	} else {
		s.locsBacking = make([]interimLoc, totLocs)
	}

	locsBacking := s.locsBacking
	for pid, numLocs := range s.numLocsPerPostingsList {
		s.Locs[pid] = locsBacking[0:0]
		locsBacking = locsBacking[numLocs:]
	}
}
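
// The FreqNorms/Locs setup above carves one large backing slice into
// per-postings-list sub-slices to avoid a separate allocation per term.
// Each s.FreqNorms[pid] starts as a zero-length slice aliasing the backing
// array at the next free offset; because exactly numTerms entries will later
// be appended to it (the count precomputed in numTermsPerPostingsList), the
// appends never spill into the next list's region. For example, with
// numTermsPerPostingsList = [3, 1], pid 0 owns backing[0:3] and pid 1 owns
// backing[3:4].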

func (s *interim) processDocuments() {
	numFields := len(s.FieldsInv)
	reuseFieldLens := make([]int, numFields)
	reuseFieldTFs := make([]index.TokenFrequencies, numFields)

	for docNum, result := range s.results {
		for i := 0; i < numFields; i++ { // clear these for reuse
			reuseFieldLens[i] = 0
			reuseFieldTFs[i] = nil
		}

		s.processDocument(uint64(docNum), result,
			reuseFieldLens, reuseFieldTFs)
	}
}

func (s *interim) processDocument(docNum uint64,
	result index.Document,
	fieldLens []int, fieldTFs []index.TokenFrequencies) {
	visitField := func(field index.Field) {
		fieldID := uint16(s.getOrDefineField(field.Name()))
		fieldLens[fieldID] += field.AnalyzedLength()

		existingFreqs := fieldTFs[fieldID]
		if existingFreqs != nil {
			existingFreqs.MergeAll(field.Name(), field.AnalyzedTokenFrequencies())
		} else {
			fieldTFs[fieldID] = field.AnalyzedTokenFrequencies()
		}
	}

	// walk each composite field
	result.VisitComposite(func(field index.CompositeField) {
		visitField(field)
	})

	// walk each field
	result.VisitFields(visitField)

	// now that it's been rolled up into fieldTFs, walk that
	for fieldID, tfs := range fieldTFs {
		dict := s.Dicts[fieldID]
		norm := float32(1.0 / math.Sqrt(float64(fieldLens[fieldID])))

		for term, tf := range tfs {
			pid := dict[term] - 1

			bs := s.Postings[pid]
			bs.Add(uint32(docNum))

			s.FreqNorms[pid] = append(s.FreqNorms[pid],
				interimFreqNorm{
					freq:    uint64(tf.Frequency()),
					norm:    norm,
					numLocs: len(tf.Locations),
				})

			if len(tf.Locations) > 0 {
				locs := s.Locs[pid]

				for _, loc := range tf.Locations {
					var locf = uint16(fieldID)
					if loc.Field != "" {
						locf = uint16(s.getOrDefineField(loc.Field))
					}
					var arrayposs []uint64
					if len(loc.ArrayPositions) > 0 {
						arrayposs = loc.ArrayPositions
					}
					locs = append(locs, interimLoc{
						fieldID:   locf,
						pos:       uint64(loc.Position),
						start:     uint64(loc.Start),
						end:       uint64(loc.End),
						arrayposs: arrayposs,
					})
				}

				s.Locs[pid] = locs
			}
		}
	}
}
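
// The norm computed above is the standard length normalization
// norm = 1 / sqrt(fieldLength): a field analyzed into 9 tokens gets norm
// 1/3 ≈ 0.333, while a 100-token field gets 0.1, so term matches in shorter
// fields score higher at query time. It is stored per (term, doc) as a
// float32 alongside the frequency.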

func (s *interim) writeStoredFields() (
	storedIndexOffset uint64, err error) {
	varBuf := make([]byte, binary.MaxVarintLen64)
	metaEncode := func(val uint64) (int, error) {
		wb := binary.PutUvarint(varBuf, val)
		return s.metaBuf.Write(varBuf[:wb])
	}

	data, compressed := s.tmp0[:0], s.tmp1[:0]
	defer func() { s.tmp0, s.tmp1 = data, compressed }()

	// keyed by docNum
	docStoredOffsets := make([]uint64, len(s.results))

	// keyed by fieldID, for the current doc in the loop
	docStoredFields := map[uint16]interimStoredField{}

	for docNum, result := range s.results {
		for fieldID := range docStoredFields { // reset for next doc
			delete(docStoredFields, fieldID)
		}

		var validationErr error
		result.VisitFields(func(field index.Field) {
			fieldID := uint16(s.getOrDefineField(field.Name()))

			if field.Options().IsStored() {
				isf := docStoredFields[fieldID]
				isf.vals = append(isf.vals, field.Value())
				isf.typs = append(isf.typs, field.EncodedFieldType())
				isf.arrayposs = append(isf.arrayposs, field.ArrayPositions())
				docStoredFields[fieldID] = isf
			}

			if field.Options().IncludeDocValues() {
				s.IncludeDocValues[fieldID] = true
			}

			err := ValidateDocFields(field)
			if err != nil && validationErr == nil {
				validationErr = err
			}
		})
		if validationErr != nil {
			return 0, validationErr
		}

		var curr int

		s.metaBuf.Reset()
		data = data[:0]

		// _id field special case optimizes ExternalID() lookups
		idFieldVal := docStoredFields[uint16(0)].vals[0]
		_, err = metaEncode(uint64(len(idFieldVal)))
		if err != nil {
			return 0, err
		}

		// handle non-"_id" fields
		for fieldID := 1; fieldID < len(s.FieldsInv); fieldID++ {
			isf, exists := docStoredFields[uint16(fieldID)]
			if exists {
				curr, data, err = persistStoredFieldValues(
					fieldID, isf.vals, isf.typs, isf.arrayposs,
					curr, metaEncode, data)
				if err != nil {
					return 0, err
				}
			}
		}

		metaBytes := s.metaBuf.Bytes()

		compressed = snappy.Encode(compressed[:cap(compressed)], data)

		docStoredOffsets[docNum] = uint64(s.w.Count())

		_, err := writeUvarints(s.w,
			uint64(len(metaBytes)),
			uint64(len(idFieldVal)+len(compressed)))
		if err != nil {
			return 0, err
		}

		_, err = s.w.Write(metaBytes)
		if err != nil {
			return 0, err
		}

		_, err = s.w.Write(idFieldVal)
		if err != nil {
			return 0, err
		}

		_, err = s.w.Write(compressed)
		if err != nil {
			return 0, err
		}
	}

	storedIndexOffset = uint64(s.w.Count())

	for _, docStoredOffset := range docStoredOffsets {
		err = binary.Write(s.w, binary.BigEndian, docStoredOffset)
		if err != nil {
			return 0, err
		}
	}

	return storedIndexOffset, nil
}
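
// As written above, each document's stored-field record has the layout
//
//	uvarint(len(metaBytes)) uvarint(len(idFieldVal) + len(compressed))
//	metaBytes | idFieldVal | snappy(data)
//
// with the uncompressed _id value placed ahead of the snappy block so that
// ExternalID() lookups can read it without decompressing the rest, followed
// by one trailing array of big-endian uint64 per-doc offsets that
// storedIndexOffset points at.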

func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err error) {
	dictOffsets = make([]uint64, len(s.FieldsInv))

	fdvOffsetsStart := make([]uint64, len(s.FieldsInv))
	fdvOffsetsEnd := make([]uint64, len(s.FieldsInv))

	buf := s.grabBuf(binary.MaxVarintLen64)

	tfEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1))
	locEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1))
	fdvEncoder := newChunkedContentCoder(uint64(s.chunkFactor), uint64(len(s.results)-1), s.w, false)

	var docTermMap [][]byte

	if s.builder == nil {
		s.builder, err = vellum.New(&s.builderBuf, nil)
		if err != nil {
			return 0, nil, err
		}
	}

	for fieldID, terms := range s.DictKeys {
		if cap(docTermMap) < len(s.results) {
			docTermMap = make([][]byte, len(s.results))
		} else {
			docTermMap = docTermMap[0:len(s.results)]
			for docNum := range docTermMap { // reset the docTermMap
				docTermMap[docNum] = docTermMap[docNum][:0]
			}
		}

		dict := s.Dicts[fieldID]

		for _, term := range terms { // terms are already sorted
			pid := dict[term] - 1

			postingsBS := s.Postings[pid]

			freqNorms := s.FreqNorms[pid]
			freqNormOffset := 0

			locs := s.Locs[pid]
			locOffset := 0

			postingsItr := postingsBS.Iterator()
			for postingsItr.HasNext() {
				docNum := uint64(postingsItr.Next())

				freqNorm := freqNorms[freqNormOffset]

				err = tfEncoder.Add(docNum,
					encodeFreqHasLocs(freqNorm.freq, freqNorm.numLocs > 0),
					uint64(math.Float32bits(freqNorm.norm)))
				if err != nil {
					return 0, nil, err
				}

				if freqNorm.numLocs > 0 {
					numBytesLocs := 0
					for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] {
						numBytesLocs += totalUvarintBytes(
							uint64(loc.fieldID), loc.pos, loc.start, loc.end,
							uint64(len(loc.arrayposs)), loc.arrayposs)
					}

					err = locEncoder.Add(docNum, uint64(numBytesLocs))
					if err != nil {
						return 0, nil, err
					}

					for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] {
						err = locEncoder.Add(docNum,
							uint64(loc.fieldID), loc.pos, loc.start, loc.end,
							uint64(len(loc.arrayposs)))
						if err != nil {
							return 0, nil, err
						}

						err = locEncoder.Add(docNum, loc.arrayposs...)
						if err != nil {
							return 0, nil, err
						}
					}

					locOffset += freqNorm.numLocs
				}

				freqNormOffset++

				docTermMap[docNum] = append(
					append(docTermMap[docNum], term...),
					termSeparator)
			}

			tfEncoder.Close()
			locEncoder.Close()

			postingsOffset, err :=
				writePostings(postingsBS, tfEncoder, locEncoder, nil, s.w, buf)
			if err != nil {
				return 0, nil, err
			}

			if postingsOffset > uint64(0) {
				err = s.builder.Insert([]byte(term), postingsOffset)
				if err != nil {
					return 0, nil, err
				}
			}

			tfEncoder.Reset()
			locEncoder.Reset()
		}

		err = s.builder.Close()
		if err != nil {
			return 0, nil, err
		}

		// record where this dictionary starts
		dictOffsets[fieldID] = uint64(s.w.Count())

		vellumData := s.builderBuf.Bytes()

		// write out the length of the vellum data
		n := binary.PutUvarint(buf, uint64(len(vellumData)))
		_, err = s.w.Write(buf[:n])
		if err != nil {
			return 0, nil, err
		}

		// write this vellum to disk
		_, err = s.w.Write(vellumData)
		if err != nil {
			return 0, nil, err
		}

		// reset vellum for reuse
		s.builderBuf.Reset()
		err = s.builder.Reset(&s.builderBuf)
		if err != nil {
			return 0, nil, err
		}

		// write the field doc values
		if s.IncludeDocValues[fieldID] {
			for docNum, docTerms := range docTermMap {
				if len(docTerms) > 0 {
					err = fdvEncoder.Add(uint64(docNum), docTerms)
					if err != nil {
						return 0, nil, err
					}
				}
			}
			err = fdvEncoder.Close()
			if err != nil {
				return 0, nil, err
			}

			fdvOffsetsStart[fieldID] = uint64(s.w.Count())

			_, err = fdvEncoder.Write()
			if err != nil {
				return 0, nil, err
			}

			fdvOffsetsEnd[fieldID] = uint64(s.w.Count())

			fdvEncoder.Reset()
		} else {
			fdvOffsetsStart[fieldID] = fieldNotUninverted
			fdvOffsetsEnd[fieldID] = fieldNotUninverted
		}
	}

	fdvIndexOffset = uint64(s.w.Count())

	for i := 0; i < len(fdvOffsetsStart); i++ {
		n := binary.PutUvarint(buf, fdvOffsetsStart[i])
		_, err := s.w.Write(buf[:n])
		if err != nil {
			return 0, nil, err
		}
		n = binary.PutUvarint(buf, fdvOffsetsEnd[i])
		_, err = s.w.Write(buf[:n])
		if err != nil {
			return 0, nil, err
		}
	}

	return fdvIndexOffset, dictOffsets, nil
}
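
// Each field's on-disk dictionary written above is a vellum FST mapping
// term -> postingsOffset, prefixed by a uvarint length so readers know how
// many bytes to hand to the FST loader. Because DictKeys was sorted in
// convert(), terms reach builder.Insert in the ascending order vellum
// requires. A hypothetical reader-side sketch (assumed usage of the public
// vellum API, not code from this package):
//
//	fst, err := vellum.Load(vellumData)
//	if err == nil {
//		postingsOffset, exists, _ := fst.Get([]byte("someterm"))
//		_, _ = postingsOffset, exists
//	}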

// returns the total # of bytes needed to encode the given uint64's
// into binary.PutUvarint() encoding
func totalUvarintBytes(a, b, c, d, e uint64, more []uint64) (n int) {
	n = numUvarintBytes(a)
	n += numUvarintBytes(b)
	n += numUvarintBytes(c)
	n += numUvarintBytes(d)
	n += numUvarintBytes(e)
	for _, v := range more {
		n += numUvarintBytes(v)
	}
	return n
}

// returns # of bytes needed to encode x in binary.PutUvarint() encoding
func numUvarintBytes(x uint64) (n int) {
	for x >= 0x80 {
		x >>= 7
		n++
	}
	return n + 1
}
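
// Worked examples: a uvarint carries 7 payload bits per byte, so
// numUvarintBytes(127) == 1, numUvarintBytes(128) == 2, and
// numUvarintBytes(300) == 2 (300 needs 9 bits, which fit in two 7-bit
// groups). This matches the byte counts binary.PutUvarint would produce.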