
new.go 21KB

// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package zap

import (
	"bytes"
	"encoding/binary"
	"math"
	"sort"
	"sync"

	"github.com/RoaringBitmap/roaring"
	"github.com/blevesearch/bleve/analysis"
	"github.com/blevesearch/bleve/document"
	"github.com/blevesearch/bleve/index"
	"github.com/blevesearch/bleve/index/scorch/segment"
	"github.com/couchbase/vellum"
	"github.com/golang/snappy"
)

var NewSegmentBufferNumResultsBump int = 100
var NewSegmentBufferNumResultsFactor float64 = 1.0
var NewSegmentBufferAvgBytesPerDocFactor float64 = 1.0

// ValidateDocFields can be set by applications to perform additional
// checks on fields in a document being added to a new segment; by
// default it does nothing.
// This API is experimental and may be removed at any time.
var ValidateDocFields = func(field document.Field) error {
	return nil
}
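
// As an illustration (not part of this file), an application could
// install a validator that rejects oversized stored fields; the 1MB
// limit and error text below are made-up example values:
//
//	zap.ValidateDocFields = func(field document.Field) error {
//		if len(field.Value()) > 1<<20 {
//			return fmt.Errorf("field %q exceeds 1MB", field.Name())
//		}
//		return nil
//	}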

// AnalysisResultsToSegmentBase produces an in-memory zap-encoded
// SegmentBase from analysis results.
func (z *ZapPlugin) New(results []*index.AnalysisResult) (
	segment.Segment, uint64, error) {
	return z.newWithChunkMode(results, DefaultChunkMode)
}

func (*ZapPlugin) newWithChunkMode(results []*index.AnalysisResult,
	chunkMode uint32) (segment.Segment, uint64, error) {
	s := interimPool.Get().(*interim)

	var br bytes.Buffer
	if s.lastNumDocs > 0 {
		// use previous results to initialize the buf with an estimated
		// size, but note that the interim instance comes from a
		// global interimPool, so multiple scorch instances indexing
		// different docs can lead to low-quality estimates
		estimateAvgBytesPerDoc := int(float64(s.lastOutSize/s.lastNumDocs) *
			NewSegmentBufferAvgBytesPerDocFactor)
		estimateNumResults := int(float64(len(results)+NewSegmentBufferNumResultsBump) *
			NewSegmentBufferNumResultsFactor)
		br.Grow(estimateAvgBytesPerDoc * estimateNumResults)
	}

	s.results = results
	s.chunkMode = chunkMode
	s.w = NewCountHashWriter(&br)

	storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets,
		err := s.convert()
	if err != nil {
		return nil, uint64(0), err
	}

	sb, err := InitSegmentBase(br.Bytes(), s.w.Sum32(), chunkMode,
		s.FieldsMap, s.FieldsInv, uint64(len(results)),
		storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets)

	if err == nil && s.reset() == nil {
		s.lastNumDocs = len(results)
		s.lastOutSize = len(br.Bytes())
		interimPool.Put(s)
	}

	return sb, uint64(len(br.Bytes())), err
}
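
// For a sense of the buffer-sizing arithmetic above: if the previous
// conversion produced a 2MB segment from 1,000 docs (~2,048 bytes per
// doc) and the current batch holds 1,000 results, the buffer is grown
// to roughly 2048 * (1000+100) ≈ 2.2MB up front, trading a little
// memory for fewer reallocations. (Illustrative numbers only.)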

var interimPool = sync.Pool{New: func() interface{} { return &interim{} }}

// interim holds temporary working data used while converting from
// analysis results to a zap-encoded segment
type interim struct {
	results []*index.AnalysisResult

	chunkMode uint32

	w *CountHashWriter

	// FieldsMap adds 1 to field id to avoid zero value issues
	//  name -> field id + 1
	FieldsMap map[string]uint16

	// FieldsInv is the inverse of FieldsMap
	//  field id -> name
	FieldsInv []string

	// Term dictionaries for each field
	//  field id -> term -> postings list id + 1
	Dicts []map[string]uint64

	// Terms for each field, where terms are sorted ascending
	//  field id -> []term
	DictKeys [][]string

	// Fields whose IncludeDocValues is true
	//  field id -> bool
	IncludeDocValues []bool

	// postings id -> bitmap of docNums
	Postings []*roaring.Bitmap

	// postings id -> freq/norm's, one for each docNum in postings
	FreqNorms        [][]interimFreqNorm
	freqNormsBacking []interimFreqNorm

	// postings id -> locs, one for each freq
	Locs        [][]interimLoc
	locsBacking []interimLoc

	numTermsPerPostingsList []int // key is postings list id
	numLocsPerPostingsList  []int // key is postings list id

	builder    *vellum.Builder
	builderBuf bytes.Buffer

	metaBuf bytes.Buffer

	tmp0 []byte
	tmp1 []byte

	lastNumDocs int
	lastOutSize int
}
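
// The "+1" convention above lets the zero value of a map lookup mean
// "not found". For example, once the _id field is defined:
//
//	s.FieldsMap["_id"] == 1     // field id 0, offset by 1
//	s.FieldsInv[0] == "_id"
//	s.FieldsMap["missing"] == 0 // zero value: no such field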

func (s *interim) reset() (err error) {
	s.results = nil
	s.chunkMode = 0
	s.w = nil
	s.FieldsMap = nil
	s.FieldsInv = nil
	for i := range s.Dicts {
		s.Dicts[i] = nil
	}
	s.Dicts = s.Dicts[:0]
	for i := range s.DictKeys {
		s.DictKeys[i] = s.DictKeys[i][:0]
	}
	s.DictKeys = s.DictKeys[:0]
	for i := range s.IncludeDocValues {
		s.IncludeDocValues[i] = false
	}
	s.IncludeDocValues = s.IncludeDocValues[:0]
	for _, idn := range s.Postings {
		idn.Clear()
	}
	s.Postings = s.Postings[:0]
	s.FreqNorms = s.FreqNorms[:0]
	for i := range s.freqNormsBacking {
		s.freqNormsBacking[i] = interimFreqNorm{}
	}
	s.freqNormsBacking = s.freqNormsBacking[:0]
	s.Locs = s.Locs[:0]
	for i := range s.locsBacking {
		s.locsBacking[i] = interimLoc{}
	}
	s.locsBacking = s.locsBacking[:0]
	s.numTermsPerPostingsList = s.numTermsPerPostingsList[:0]
	s.numLocsPerPostingsList = s.numLocsPerPostingsList[:0]
	s.builderBuf.Reset()
	if s.builder != nil {
		err = s.builder.Reset(&s.builderBuf)
	}
	s.metaBuf.Reset()
	s.tmp0 = s.tmp0[:0]
	s.tmp1 = s.tmp1[:0]
	s.lastNumDocs = 0
	s.lastOutSize = 0

	return err
}

func (s *interim) grabBuf(size int) []byte {
	buf := s.tmp0
	if cap(buf) < size {
		buf = make([]byte, size)
		s.tmp0 = buf
	}
	return buf[0:size]
}
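
// Note that grabBuf returns a slice aliasing s.tmp0, so the result is
// only valid until the next use of s.tmp0 (e.g. another grabBuf call);
// callers in this file treat it as a scratch buffer and never hold it
// across another grabBuf call.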

type interimStoredField struct {
	vals      [][]byte
	typs      []byte
	arrayposs [][]uint64 // array positions
}

type interimFreqNorm struct {
	freq    uint64
	norm    float32
	numLocs int
}

type interimLoc struct {
	fieldID   uint16
	pos       uint64
	start     uint64
	end       uint64
	arrayposs []uint64
}

func (s *interim) convert() (uint64, uint64, uint64, []uint64, error) {
	s.FieldsMap = map[string]uint16{}

	s.getOrDefineField("_id") // _id field is fieldID 0

	for _, result := range s.results {
		for _, field := range result.Document.CompositeFields {
			s.getOrDefineField(field.Name())
		}
		for _, field := range result.Document.Fields {
			s.getOrDefineField(field.Name())
		}
	}

	sort.Strings(s.FieldsInv[1:]) // keep _id as first field

	for fieldID, fieldName := range s.FieldsInv {
		s.FieldsMap[fieldName] = uint16(fieldID + 1)
	}

	if cap(s.IncludeDocValues) >= len(s.FieldsInv) {
		s.IncludeDocValues = s.IncludeDocValues[:len(s.FieldsInv)]
	} else {
		s.IncludeDocValues = make([]bool, len(s.FieldsInv))
	}

	s.prepareDicts()

	for _, dict := range s.DictKeys {
		sort.Strings(dict)
	}

	s.processDocuments()

	storedIndexOffset, err := s.writeStoredFields()
	if err != nil {
		return 0, 0, 0, nil, err
	}

	var fdvIndexOffset uint64
	var dictOffsets []uint64

	if len(s.results) > 0 {
		fdvIndexOffset, dictOffsets, err = s.writeDicts()
		if err != nil {
			return 0, 0, 0, nil, err
		}
	} else {
		dictOffsets = make([]uint64, len(s.FieldsInv))
	}

	fieldsIndexOffset, err := persistFields(s.FieldsInv, s.w, dictOffsets)
	if err != nil {
		return 0, 0, 0, nil, err
	}

	return storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, nil
}
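
// The write order in convert() determines the on-disk layout of the
// resulting SegmentBase: the stored-field section comes first, then
// the per-field dictionaries with their postings and doc values, and
// finally the fields index written by persistFields.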

func (s *interim) getOrDefineField(fieldName string) int {
	fieldIDPlus1, exists := s.FieldsMap[fieldName]
	if !exists {
		fieldIDPlus1 = uint16(len(s.FieldsInv) + 1)
		s.FieldsMap[fieldName] = fieldIDPlus1
		s.FieldsInv = append(s.FieldsInv, fieldName)

		s.Dicts = append(s.Dicts, make(map[string]uint64))

		n := len(s.DictKeys)
		if n < cap(s.DictKeys) {
			s.DictKeys = s.DictKeys[:n+1]
			s.DictKeys[n] = s.DictKeys[n][:0]
		} else {
			s.DictKeys = append(s.DictKeys, []string(nil))
		}
	}

	return int(fieldIDPlus1 - 1)
}
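
// For example, the very first call, getOrDefineField("_id"), defines
// field id 0 (stored in FieldsMap as 1); a repeat call for a name
// that already exists simply returns its existing id.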

// fill Dicts and DictKeys from analysis results
func (s *interim) prepareDicts() {
	var pidNext int

	var totTFs int
	var totLocs int

	visitField := func(fieldID uint16, tfs analysis.TokenFrequencies) {
		dict := s.Dicts[fieldID]
		dictKeys := s.DictKeys[fieldID]

		for term, tf := range tfs {
			pidPlus1, exists := dict[term]
			if !exists {
				pidNext++
				pidPlus1 = uint64(pidNext)

				dict[term] = pidPlus1
				dictKeys = append(dictKeys, term)

				s.numTermsPerPostingsList = append(s.numTermsPerPostingsList, 0)
				s.numLocsPerPostingsList = append(s.numLocsPerPostingsList, 0)
			}

			pid := pidPlus1 - 1

			s.numTermsPerPostingsList[pid] += 1
			s.numLocsPerPostingsList[pid] += len(tf.Locations)

			totLocs += len(tf.Locations)
		}

		totTFs += len(tfs)

		s.DictKeys[fieldID] = dictKeys
	}

	for _, result := range s.results {
		// walk each composite field
		for _, field := range result.Document.CompositeFields {
			fieldID := uint16(s.getOrDefineField(field.Name()))
			_, tf := field.Analyze()
			visitField(fieldID, tf)
		}

		// walk each field
		for i, field := range result.Document.Fields {
			fieldID := uint16(s.getOrDefineField(field.Name()))
			tf := result.Analyzed[i]
			visitField(fieldID, tf)
		}
	}

	numPostingsLists := pidNext

	if cap(s.Postings) >= numPostingsLists {
		s.Postings = s.Postings[:numPostingsLists]
	} else {
		postings := make([]*roaring.Bitmap, numPostingsLists)
		copy(postings, s.Postings[:cap(s.Postings)])
		for i := 0; i < numPostingsLists; i++ {
			if postings[i] == nil {
				postings[i] = roaring.New()
			}
		}
		s.Postings = postings
	}

	if cap(s.FreqNorms) >= numPostingsLists {
		s.FreqNorms = s.FreqNorms[:numPostingsLists]
	} else {
		s.FreqNorms = make([][]interimFreqNorm, numPostingsLists)
	}

	if cap(s.freqNormsBacking) >= totTFs {
		s.freqNormsBacking = s.freqNormsBacking[:totTFs]
	} else {
		s.freqNormsBacking = make([]interimFreqNorm, totTFs)
	}

	freqNormsBacking := s.freqNormsBacking
	for pid, numTerms := range s.numTermsPerPostingsList {
		s.FreqNorms[pid] = freqNormsBacking[0:0]
		freqNormsBacking = freqNormsBacking[numTerms:]
	}

	if cap(s.Locs) >= numPostingsLists {
		s.Locs = s.Locs[:numPostingsLists]
	} else {
		s.Locs = make([][]interimLoc, numPostingsLists)
	}

	if cap(s.locsBacking) >= totLocs {
		s.locsBacking = s.locsBacking[:totLocs]
	} else {
		s.locsBacking = make([]interimLoc, totLocs)
	}

	locsBacking := s.locsBacking
	for pid, numLocs := range s.numLocsPerPostingsList {
		s.Locs[pid] = locsBacking[0:0]
		locsBacking = locsBacking[numLocs:]
	}
}
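
// The carving pattern above (handing each postings list a zero-length
// slice into one shared backing array sized from the exact term and
// location counts) lets the appends performed later in processDocument
// fill contiguous, preallocated memory instead of growing many small
// slices independently.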

func (s *interim) processDocuments() {
	numFields := len(s.FieldsInv)
	reuseFieldLens := make([]int, numFields)
	reuseFieldTFs := make([]analysis.TokenFrequencies, numFields)

	for docNum, result := range s.results {
		for i := 0; i < numFields; i++ { // clear these for reuse
			reuseFieldLens[i] = 0
			reuseFieldTFs[i] = nil
		}

		s.processDocument(uint64(docNum), result,
			reuseFieldLens, reuseFieldTFs)
	}
}

func (s *interim) processDocument(docNum uint64,
	result *index.AnalysisResult,
	fieldLens []int, fieldTFs []analysis.TokenFrequencies) {
	visitField := func(fieldID uint16, fieldName string,
		ln int, tf analysis.TokenFrequencies) {
		fieldLens[fieldID] += ln

		existingFreqs := fieldTFs[fieldID]
		if existingFreqs != nil {
			existingFreqs.MergeAll(fieldName, tf)
		} else {
			fieldTFs[fieldID] = tf
		}
	}

	// walk each composite field
	for _, field := range result.Document.CompositeFields {
		fieldID := uint16(s.getOrDefineField(field.Name()))
		ln, tf := field.Analyze()
		visitField(fieldID, field.Name(), ln, tf)
	}

	// walk each field
	for i, field := range result.Document.Fields {
		fieldID := uint16(s.getOrDefineField(field.Name()))
		ln := result.Length[i]
		tf := result.Analyzed[i]
		visitField(fieldID, field.Name(), ln, tf)
	}

	// now that it's been rolled up into fieldTFs, walk that
	for fieldID, tfs := range fieldTFs {
		dict := s.Dicts[fieldID]
		norm := float32(1.0 / math.Sqrt(float64(fieldLens[fieldID])))

		for term, tf := range tfs {
			pid := dict[term] - 1

			bs := s.Postings[pid]
			bs.Add(uint32(docNum))

			s.FreqNorms[pid] = append(s.FreqNorms[pid],
				interimFreqNorm{
					freq:    uint64(tf.Frequency()),
					norm:    norm,
					numLocs: len(tf.Locations),
				})

			if len(tf.Locations) > 0 {
				locs := s.Locs[pid]

				for _, loc := range tf.Locations {
					var locf = uint16(fieldID)
					if loc.Field != "" {
						locf = uint16(s.getOrDefineField(loc.Field))
					}
					var arrayposs []uint64
					if len(loc.ArrayPositions) > 0 {
						arrayposs = loc.ArrayPositions
					}
					locs = append(locs, interimLoc{
						fieldID:   locf,
						pos:       uint64(loc.Position),
						start:     uint64(loc.Start),
						end:       uint64(loc.End),
						arrayposs: arrayposs,
					})
				}

				s.Locs[pid] = locs
			}
		}
	}
}
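
// A quick example of the norm computed above: a field with 100 tokens
// in a document gets norm 1/sqrt(100) = 0.1, so matches in longer
// fields contribute proportionally less at scoring time.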

func (s *interim) writeStoredFields() (
	storedIndexOffset uint64, err error) {
	varBuf := make([]byte, binary.MaxVarintLen64)
	metaEncode := func(val uint64) (int, error) {
		wb := binary.PutUvarint(varBuf, val)
		return s.metaBuf.Write(varBuf[:wb])
	}

	data, compressed := s.tmp0[:0], s.tmp1[:0]
	defer func() { s.tmp0, s.tmp1 = data, compressed }()

	// keyed by docNum
	docStoredOffsets := make([]uint64, len(s.results))

	// keyed by fieldID, for the current doc in the loop
	docStoredFields := map[uint16]interimStoredField{}

	for docNum, result := range s.results {
		for fieldID := range docStoredFields { // reset for next doc
			delete(docStoredFields, fieldID)
		}

		for _, field := range result.Document.Fields {
			fieldID := uint16(s.getOrDefineField(field.Name()))

			opts := field.Options()

			if opts.IsStored() {
				isf := docStoredFields[fieldID]
				isf.vals = append(isf.vals, field.Value())
				isf.typs = append(isf.typs, encodeFieldType(field))
				isf.arrayposs = append(isf.arrayposs, field.ArrayPositions())
				docStoredFields[fieldID] = isf
			}

			if opts.IncludeDocValues() {
				s.IncludeDocValues[fieldID] = true
			}

			err := ValidateDocFields(field)
			if err != nil {
				return 0, err
			}
		}

		var curr int

		s.metaBuf.Reset()
		data = data[:0]

		// _id field special case optimizes ExternalID() lookups
		idFieldVal := docStoredFields[uint16(0)].vals[0]
		_, err = metaEncode(uint64(len(idFieldVal)))
		if err != nil {
			return 0, err
		}

		// handle non-"_id" fields
		for fieldID := 1; fieldID < len(s.FieldsInv); fieldID++ {
			isf, exists := docStoredFields[uint16(fieldID)]
			if exists {
				curr, data, err = persistStoredFieldValues(
					fieldID, isf.vals, isf.typs, isf.arrayposs,
					curr, metaEncode, data)
				if err != nil {
					return 0, err
				}
			}
		}

		metaBytes := s.metaBuf.Bytes()

		compressed = snappy.Encode(compressed[:cap(compressed)], data)

		docStoredOffsets[docNum] = uint64(s.w.Count())

		_, err := writeUvarints(s.w,
			uint64(len(metaBytes)),
			uint64(len(idFieldVal)+len(compressed)))
		if err != nil {
			return 0, err
		}

		_, err = s.w.Write(metaBytes)
		if err != nil {
			return 0, err
		}

		_, err = s.w.Write(idFieldVal)
		if err != nil {
			return 0, err
		}

		_, err = s.w.Write(compressed)
		if err != nil {
			return 0, err
		}
	}

	storedIndexOffset = uint64(s.w.Count())

	for _, docStoredOffset := range docStoredOffsets {
		err = binary.Write(s.w, binary.BigEndian, docStoredOffset)
		if err != nil {
			return 0, err
		}
	}

	return storedIndexOffset, nil
}
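
// Each per-document record written above has the shape:
//
//	uvarint: length of the meta section
//	uvarint: length of the _id value + length of the compressed data
//	meta section (uvarint-encoded field/value descriptors)
//	raw _id field value (uncompressed, for fast ExternalID lookups)
//	snappy-compressed remaining field data
//
// followed, after all documents, by one big-endian uint64 offset per
// document forming the stored-fields index.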

func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err error) {
	dictOffsets = make([]uint64, len(s.FieldsInv))

	fdvOffsetsStart := make([]uint64, len(s.FieldsInv))
	fdvOffsetsEnd := make([]uint64, len(s.FieldsInv))

	buf := s.grabBuf(binary.MaxVarintLen64)

	// these int coders are initialized with chunk size 1024;
	// however, they will be reset to the correct chunk size
	// while processing each individual field-term section
	tfEncoder := newChunkedIntCoder(1024, uint64(len(s.results)-1))
	locEncoder := newChunkedIntCoder(1024, uint64(len(s.results)-1))

	var docTermMap [][]byte

	if s.builder == nil {
		s.builder, err = vellum.New(&s.builderBuf, nil)
		if err != nil {
			return 0, nil, err
		}
	}

	for fieldID, terms := range s.DictKeys {
		if cap(docTermMap) < len(s.results) {
			docTermMap = make([][]byte, len(s.results))
		} else {
			docTermMap = docTermMap[0:len(s.results)]
			for docNum := range docTermMap { // reset the docTermMap
				docTermMap[docNum] = docTermMap[docNum][:0]
			}
		}

		dict := s.Dicts[fieldID]

		for _, term := range terms { // terms are already sorted
			pid := dict[term] - 1

			postingsBS := s.Postings[pid]

			freqNorms := s.FreqNorms[pid]
			freqNormOffset := 0

			locs := s.Locs[pid]
			locOffset := 0

			chunkSize, err := getChunkSize(s.chunkMode, postingsBS.GetCardinality(), uint64(len(s.results)))
			if err != nil {
				return 0, nil, err
			}
			tfEncoder.SetChunkSize(chunkSize, uint64(len(s.results)-1))
			locEncoder.SetChunkSize(chunkSize, uint64(len(s.results)-1))

			postingsItr := postingsBS.Iterator()
			for postingsItr.HasNext() {
				docNum := uint64(postingsItr.Next())

				freqNorm := freqNorms[freqNormOffset]

				err = tfEncoder.Add(docNum,
					encodeFreqHasLocs(freqNorm.freq, freqNorm.numLocs > 0),
					uint64(math.Float32bits(freqNorm.norm)))
				if err != nil {
					return 0, nil, err
				}

				if freqNorm.numLocs > 0 {
					numBytesLocs := 0
					for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] {
						numBytesLocs += totalUvarintBytes(
							uint64(loc.fieldID), loc.pos, loc.start, loc.end,
							uint64(len(loc.arrayposs)), loc.arrayposs)
					}

					err = locEncoder.Add(docNum, uint64(numBytesLocs))
					if err != nil {
						return 0, nil, err
					}

					for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] {
						err = locEncoder.Add(docNum,
							uint64(loc.fieldID), loc.pos, loc.start, loc.end,
							uint64(len(loc.arrayposs)))
						if err != nil {
							return 0, nil, err
						}

						err = locEncoder.Add(docNum, loc.arrayposs...)
						if err != nil {
							return 0, nil, err
						}
					}

					locOffset += freqNorm.numLocs
				}

				freqNormOffset++

				docTermMap[docNum] = append(
					append(docTermMap[docNum], term...),
					termSeparator)
			}

			tfEncoder.Close()
			locEncoder.Close()

			postingsOffset, err :=
				writePostings(postingsBS, tfEncoder, locEncoder, nil, s.w, buf)
			if err != nil {
				return 0, nil, err
			}

			if postingsOffset > uint64(0) {
				err = s.builder.Insert([]byte(term), postingsOffset)
				if err != nil {
					return 0, nil, err
				}
			}

			tfEncoder.Reset()
			locEncoder.Reset()
		}

		err = s.builder.Close()
		if err != nil {
			return 0, nil, err
		}

		// record where this dictionary starts
		dictOffsets[fieldID] = uint64(s.w.Count())

		vellumData := s.builderBuf.Bytes()

		// write out the length of the vellum data
		n := binary.PutUvarint(buf, uint64(len(vellumData)))
		_, err = s.w.Write(buf[:n])
		if err != nil {
			return 0, nil, err
		}

		// write this vellum to disk
		_, err = s.w.Write(vellumData)
		if err != nil {
			return 0, nil, err
		}

		// reset vellum for reuse
		s.builderBuf.Reset()

		err = s.builder.Reset(&s.builderBuf)
		if err != nil {
			return 0, nil, err
		}

		// write the field doc values
		// NOTE: doc values continue to use legacy chunk mode
		chunkSize, err := getChunkSize(LegacyChunkMode, 0, 0)
		if err != nil {
			return 0, nil, err
		}
		fdvEncoder := newChunkedContentCoder(chunkSize, uint64(len(s.results)-1), s.w, false)

		if s.IncludeDocValues[fieldID] {
			for docNum, docTerms := range docTermMap {
				if len(docTerms) > 0 {
					err = fdvEncoder.Add(uint64(docNum), docTerms)
					if err != nil {
						return 0, nil, err
					}
				}
			}
			err = fdvEncoder.Close()
			if err != nil {
				return 0, nil, err
			}

			fdvOffsetsStart[fieldID] = uint64(s.w.Count())

			_, err = fdvEncoder.Write()
			if err != nil {
				return 0, nil, err
			}

			fdvOffsetsEnd[fieldID] = uint64(s.w.Count())

			fdvEncoder.Reset()
		} else {
			fdvOffsetsStart[fieldID] = fieldNotUninverted
			fdvOffsetsEnd[fieldID] = fieldNotUninverted
		}
	}

	fdvIndexOffset = uint64(s.w.Count())

	for i := 0; i < len(fdvOffsetsStart); i++ {
		n := binary.PutUvarint(buf, fdvOffsetsStart[i])
		_, err := s.w.Write(buf[:n])
		if err != nil {
			return 0, nil, err
		}
		n = binary.PutUvarint(buf, fdvOffsetsEnd[i])
		_, err = s.w.Write(buf[:n])
		if err != nil {
			return 0, nil, err
		}
	}

	return fdvIndexOffset, dictOffsets, nil
}
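
// To summarize the per-field output of writeDicts: each term's
// postings (with its freq/norm and location chunks) is written first,
// then the vellum FST for the field's dictionary (uvarint length +
// bytes), then the field's doc-value chunks; a final table of
// start/end doc-value offsets per field is appended and its position
// returned as fdvIndexOffset.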

func encodeFieldType(f document.Field) byte {
	fieldType := byte('x')
	switch f.(type) {
	case *document.TextField:
		fieldType = 't'
	case *document.NumericField:
		fieldType = 'n'
	case *document.DateTimeField:
		fieldType = 'd'
	case *document.BooleanField:
		fieldType = 'b'
	case *document.GeoPointField:
		fieldType = 'g'
	case *document.CompositeField:
		fieldType = 'c'
	}
	return fieldType
}

// returns the total # of bytes needed to encode the given uint64's
// in binary.PutUvarint() encoding
func totalUvarintBytes(a, b, c, d, e uint64, more []uint64) (n int) {
	n = numUvarintBytes(a)
	n += numUvarintBytes(b)
	n += numUvarintBytes(c)
	n += numUvarintBytes(d)
	n += numUvarintBytes(e)
	for _, v := range more {
		n += numUvarintBytes(v)
	}
	return n
}
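
// For example, numUvarintBytes(127) == 1 while numUvarintBytes(300)
// == 2, since uvarint encoding stores 7 payload bits per byte with
// the high bit serving as a continuation flag.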

// returns # of bytes needed to encode x in binary.PutUvarint() encoding
func numUvarintBytes(x uint64) (n int) {
	for x >= 0x80 {
		x >>= 7
		n++
	}
	return n + 1
}