// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package zap

import (
	"bytes"
	"encoding/binary"
	"math"
	"sort"
	"sync"

	"github.com/RoaringBitmap/roaring"
	"github.com/blevesearch/bleve/analysis"
	"github.com/blevesearch/bleve/document"
	"github.com/blevesearch/bleve/index"
	"github.com/blevesearch/bleve/index/scorch/segment"
	"github.com/couchbase/vellum"
	"github.com/golang/snappy"
)

var NewSegmentBufferNumResultsBump int = 100
var NewSegmentBufferNumResultsFactor float64 = 1.0
var NewSegmentBufferAvgBytesPerDocFactor float64 = 1.0

// ValidateDocFields can be set by applications to perform additional
// checks on fields in a document being added to a new segment; by
// default it does nothing.
// This API is experimental and may be removed at any time.
var ValidateDocFields = func(field document.Field) error {
	return nil
}

var defaultChunkFactor uint32 = 1024

// New produces an in-memory zap-encoded SegmentBase from analysis
// results.
func (z *ZapPlugin) New(results []*index.AnalysisResult) (
	segment.Segment, uint64, error) {
	return z.newWithChunkFactor(results, defaultChunkFactor)
}
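
// An illustrative call (a sketch; producing "results" via the scorch
// analysis pipeline is assumed and not shown):
//
//	var z ZapPlugin
//	seg, encodedLen, err := z.New(results)
//
// where encodedLen is the length in bytes of the in-memory zap
// encoding backing the returned segment.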

func (*ZapPlugin) newWithChunkFactor(results []*index.AnalysisResult,
	chunkFactor uint32) (segment.Segment, uint64, error) {
	s := interimPool.Get().(*interim)

	var br bytes.Buffer
	if s.lastNumDocs > 0 {
		// use previous results to initialize the buf with an estimated
		// size, but note that the interim instance comes from a
		// global interimPool, so multiple scorch instances indexing
		// different docs can lead to low quality estimates
		estimateAvgBytesPerDoc := int(float64(s.lastOutSize/s.lastNumDocs) *
			NewSegmentBufferAvgBytesPerDocFactor)
		estimateNumResults := int(float64(len(results)+NewSegmentBufferNumResultsBump) *
			NewSegmentBufferNumResultsFactor)
		br.Grow(estimateAvgBytesPerDoc * estimateNumResults)
	}

	s.results = results
	s.chunkFactor = chunkFactor
	s.w = NewCountHashWriter(&br)

	storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets,
		err := s.convert()
	if err != nil {
		return nil, uint64(0), err
	}

	sb, err := InitSegmentBase(br.Bytes(), s.w.Sum32(), chunkFactor,
		s.FieldsMap, s.FieldsInv, uint64(len(results)),
		storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets)

	if err == nil && s.reset() == nil {
		s.lastNumDocs = len(results)
		s.lastOutSize = len(br.Bytes())
		interimPool.Put(s)
	}

	return sb, uint64(len(br.Bytes())), err
}
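
// interimPool recycles interim instances (and the large backing slices
// they hold) across segment builds to reduce allocation churn.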
var interimPool = sync.Pool{New: func() interface{} { return &interim{} }}

// interim holds temporary working data used while converting from
// analysis results to a zap-encoded segment
type interim struct {
	results []*index.AnalysisResult

	chunkFactor uint32

	w *CountHashWriter

	// FieldsMap adds 1 to field id to avoid zero value issues
	//  name -> field id + 1
	FieldsMap map[string]uint16

	// FieldsInv is the inverse of FieldsMap
	//  field id -> name
	FieldsInv []string

	// Term dictionaries for each field
	//  field id -> term -> postings list id + 1
	Dicts []map[string]uint64

	// Terms for each field, where terms are sorted ascending
	//  field id -> []term
	DictKeys [][]string

	// Fields whose IncludeDocValues is true
	//  field id -> bool
	IncludeDocValues []bool

	// postings id -> bitmap of docNums
	Postings []*roaring.Bitmap

	// postings id -> freq/norm's, one for each docNum in postings
	FreqNorms        [][]interimFreqNorm
	freqNormsBacking []interimFreqNorm

	// postings id -> locs, one for each freq
	Locs        [][]interimLoc
	locsBacking []interimLoc

	numTermsPerPostingsList []int // key is postings list id
	numLocsPerPostingsList  []int // key is postings list id

	builder    *vellum.Builder
	builderBuf bytes.Buffer

	metaBuf bytes.Buffer

	tmp0 []byte
	tmp1 []byte

	lastNumDocs int
	lastOutSize int
}
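
// reset prepares the interim for reuse by the pool: slices are
// truncated to length zero (retaining their backing capacity) and
// referenced values are cleared so they can be garbage collected.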
func (s *interim) reset() (err error) {
	s.results = nil
	s.chunkFactor = 0
	s.w = nil
	s.FieldsMap = nil
	s.FieldsInv = nil
	for i := range s.Dicts {
		s.Dicts[i] = nil
	}
	s.Dicts = s.Dicts[:0]
	for i := range s.DictKeys {
		s.DictKeys[i] = s.DictKeys[i][:0]
	}
	s.DictKeys = s.DictKeys[:0]
	for i := range s.IncludeDocValues {
		s.IncludeDocValues[i] = false
	}
	s.IncludeDocValues = s.IncludeDocValues[:0]
	for _, idn := range s.Postings {
		idn.Clear()
	}
	s.Postings = s.Postings[:0]
	s.FreqNorms = s.FreqNorms[:0]
	for i := range s.freqNormsBacking {
		s.freqNormsBacking[i] = interimFreqNorm{}
	}
	s.freqNormsBacking = s.freqNormsBacking[:0]
	s.Locs = s.Locs[:0]
	for i := range s.locsBacking {
		s.locsBacking[i] = interimLoc{}
	}
	s.locsBacking = s.locsBacking[:0]
	s.numTermsPerPostingsList = s.numTermsPerPostingsList[:0]
	s.numLocsPerPostingsList = s.numLocsPerPostingsList[:0]
	s.builderBuf.Reset()
	if s.builder != nil {
		err = s.builder.Reset(&s.builderBuf)
	}
	s.metaBuf.Reset()
	s.tmp0 = s.tmp0[:0]
	s.tmp1 = s.tmp1[:0]
	s.lastNumDocs = 0
	s.lastOutSize = 0

	return err
}
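
// grabBuf returns a scratch buffer of exactly size bytes, growing the
// reusable tmp0 backing slice only when its capacity is insufficient.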
func (s *interim) grabBuf(size int) []byte {
	buf := s.tmp0
	if cap(buf) < size {
		buf = make([]byte, size)
		s.tmp0 = buf
	}
	return buf[0:size]
}

type interimStoredField struct {
	vals      [][]byte
	typs      []byte
	arrayposs [][]uint64 // array positions
}

type interimFreqNorm struct {
	freq    uint64
	norm    float32
	numLocs int
}

type interimLoc struct {
	fieldID   uint16
	pos       uint64
	start     uint64
	end       uint64
	arrayposs []uint64
}
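
// convert runs the full pipeline: assign field ids (with _id fixed at
// field 0), build the per-field term dictionaries, process documents
// into postings/freq/norm/loc data, then write out the stored fields,
// the dictionaries and doc values, and finally the fields index.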
func (s *interim) convert() (uint64, uint64, uint64, []uint64, error) {
	s.FieldsMap = map[string]uint16{}

	s.getOrDefineField("_id") // _id field is fieldID 0

	for _, result := range s.results {
		for _, field := range result.Document.CompositeFields {
			s.getOrDefineField(field.Name())
		}
		for _, field := range result.Document.Fields {
			s.getOrDefineField(field.Name())
		}
	}

	sort.Strings(s.FieldsInv[1:]) // keep _id as first field

	for fieldID, fieldName := range s.FieldsInv {
		s.FieldsMap[fieldName] = uint16(fieldID + 1)
	}

	if cap(s.IncludeDocValues) >= len(s.FieldsInv) {
		s.IncludeDocValues = s.IncludeDocValues[:len(s.FieldsInv)]
	} else {
		s.IncludeDocValues = make([]bool, len(s.FieldsInv))
	}

	s.prepareDicts()

	for _, dict := range s.DictKeys {
		sort.Strings(dict)
	}

	s.processDocuments()

	storedIndexOffset, err := s.writeStoredFields()
	if err != nil {
		return 0, 0, 0, nil, err
	}

	var fdvIndexOffset uint64
	var dictOffsets []uint64

	if len(s.results) > 0 {
		fdvIndexOffset, dictOffsets, err = s.writeDicts()
		if err != nil {
			return 0, 0, 0, nil, err
		}
	} else {
		dictOffsets = make([]uint64, len(s.FieldsInv))
	}

	fieldsIndexOffset, err := persistFields(s.FieldsInv, s.w, dictOffsets)
	if err != nil {
		return 0, 0, 0, nil, err
	}

	return storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, nil
}
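
// getOrDefineField returns the field id for fieldName, assigning the
// next id (and its per-field dictionary slots) on first encounter.
// FieldsMap stores id+1 so that a missing key's zero value is
// distinguishable from field 0 (_id).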
func (s *interim) getOrDefineField(fieldName string) int {
	fieldIDPlus1, exists := s.FieldsMap[fieldName]
	if !exists {
		fieldIDPlus1 = uint16(len(s.FieldsInv) + 1)
		s.FieldsMap[fieldName] = fieldIDPlus1
		s.FieldsInv = append(s.FieldsInv, fieldName)

		s.Dicts = append(s.Dicts, make(map[string]uint64))

		n := len(s.DictKeys)
		if n < cap(s.DictKeys) {
			s.DictKeys = s.DictKeys[:n+1]
			s.DictKeys[n] = s.DictKeys[n][:0]
		} else {
			s.DictKeys = append(s.DictKeys, []string(nil))
		}
	}

	return int(fieldIDPlus1 - 1)
}

// fill Dicts and DictKeys from analysis results
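// Postings list ids are assigned in first-encounter order across all
// fields; each Dicts entry maps term -> pid+1 so the map's zero value
// can mean "absent". The per-pid FreqNorms and Locs slices are carved
// out of shared backing arrays sized from the counted totals, keeping
// allocations to a few large ones.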
func (s *interim) prepareDicts() {
	var pidNext int

	var totTFs int
	var totLocs int

	visitField := func(fieldID uint16, tfs analysis.TokenFrequencies) {
		dict := s.Dicts[fieldID]
		dictKeys := s.DictKeys[fieldID]

		for term, tf := range tfs {
			pidPlus1, exists := dict[term]
			if !exists {
				pidNext++
				pidPlus1 = uint64(pidNext)

				dict[term] = pidPlus1
				dictKeys = append(dictKeys, term)

				s.numTermsPerPostingsList = append(s.numTermsPerPostingsList, 0)
				s.numLocsPerPostingsList = append(s.numLocsPerPostingsList, 0)
			}

			pid := pidPlus1 - 1

			s.numTermsPerPostingsList[pid] += 1
			s.numLocsPerPostingsList[pid] += len(tf.Locations)

			totLocs += len(tf.Locations)
		}

		totTFs += len(tfs)

		s.DictKeys[fieldID] = dictKeys
	}

	for _, result := range s.results {
		// walk each composite field
		for _, field := range result.Document.CompositeFields {
			fieldID := uint16(s.getOrDefineField(field.Name()))
			_, tf := field.Analyze()
			visitField(fieldID, tf)
		}

		// walk each field
		for i, field := range result.Document.Fields {
			fieldID := uint16(s.getOrDefineField(field.Name()))
			tf := result.Analyzed[i]
			visitField(fieldID, tf)
		}
	}

	numPostingsLists := pidNext

	if cap(s.Postings) >= numPostingsLists {
		s.Postings = s.Postings[:numPostingsLists]
	} else {
		postings := make([]*roaring.Bitmap, numPostingsLists)
		copy(postings, s.Postings[:cap(s.Postings)])
		for i := 0; i < numPostingsLists; i++ {
			if postings[i] == nil {
				postings[i] = roaring.New()
			}
		}
		s.Postings = postings
	}

	if cap(s.FreqNorms) >= numPostingsLists {
		s.FreqNorms = s.FreqNorms[:numPostingsLists]
	} else {
		s.FreqNorms = make([][]interimFreqNorm, numPostingsLists)
	}

	if cap(s.freqNormsBacking) >= totTFs {
		s.freqNormsBacking = s.freqNormsBacking[:totTFs]
	} else {
		s.freqNormsBacking = make([]interimFreqNorm, totTFs)
	}

	freqNormsBacking := s.freqNormsBacking
	for pid, numTerms := range s.numTermsPerPostingsList {
		s.FreqNorms[pid] = freqNormsBacking[0:0]
		freqNormsBacking = freqNormsBacking[numTerms:]
	}

	if cap(s.Locs) >= numPostingsLists {
		s.Locs = s.Locs[:numPostingsLists]
	} else {
		s.Locs = make([][]interimLoc, numPostingsLists)
	}

	if cap(s.locsBacking) >= totLocs {
		s.locsBacking = s.locsBacking[:totLocs]
	} else {
		s.locsBacking = make([]interimLoc, totLocs)
	}

	locsBacking := s.locsBacking
	for pid, numLocs := range s.numLocsPerPostingsList {
		s.Locs[pid] = locsBacking[0:0]
		locsBacking = locsBacking[numLocs:]
	}
}
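
// processDocuments feeds every analysis result through processDocument,
// reusing a single pair of per-field length and token-frequency slices
// across documents to avoid per-document allocations.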
func (s *interim) processDocuments() {
	numFields := len(s.FieldsInv)
	reuseFieldLens := make([]int, numFields)
	reuseFieldTFs := make([]analysis.TokenFrequencies, numFields)

	for docNum, result := range s.results {
		for i := 0; i < numFields; i++ { // clear these for reuse
			reuseFieldLens[i] = 0
			reuseFieldTFs[i] = nil
		}

		s.processDocument(uint64(docNum), result,
			reuseFieldLens, reuseFieldTFs)
	}
}
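
// processDocument rolls the document's composite and regular fields up
// into per-field token frequencies, then records, for every (field,
// term) pair: the doc in the postings bitmap, a freq/norm entry (norm
// is 1/sqrt(field length)), and any token locations.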
func (s *interim) processDocument(docNum uint64,
	result *index.AnalysisResult,
	fieldLens []int, fieldTFs []analysis.TokenFrequencies) {
	visitField := func(fieldID uint16, fieldName string,
		ln int, tf analysis.TokenFrequencies) {
		fieldLens[fieldID] += ln

		existingFreqs := fieldTFs[fieldID]
		if existingFreqs != nil {
			existingFreqs.MergeAll(fieldName, tf)
		} else {
			fieldTFs[fieldID] = tf
		}
	}

	// walk each composite field
	for _, field := range result.Document.CompositeFields {
		fieldID := uint16(s.getOrDefineField(field.Name()))
		ln, tf := field.Analyze()
		visitField(fieldID, field.Name(), ln, tf)
	}

	// walk each field
	for i, field := range result.Document.Fields {
		fieldID := uint16(s.getOrDefineField(field.Name()))
		ln := result.Length[i]
		tf := result.Analyzed[i]
		visitField(fieldID, field.Name(), ln, tf)
	}

	// now that it's been rolled up into fieldTFs, walk that
	for fieldID, tfs := range fieldTFs {
		dict := s.Dicts[fieldID]
		norm := float32(1.0 / math.Sqrt(float64(fieldLens[fieldID])))

		for term, tf := range tfs {
			pid := dict[term] - 1

			bs := s.Postings[pid]
			bs.Add(uint32(docNum))

			s.FreqNorms[pid] = append(s.FreqNorms[pid],
				interimFreqNorm{
					freq:    uint64(tf.Frequency()),
					norm:    norm,
					numLocs: len(tf.Locations),
				})

			if len(tf.Locations) > 0 {
				locs := s.Locs[pid]

				for _, loc := range tf.Locations {
					var locf = uint16(fieldID)
					if loc.Field != "" {
						locf = uint16(s.getOrDefineField(loc.Field))
					}
					var arrayposs []uint64
					if len(loc.ArrayPositions) > 0 {
						arrayposs = loc.ArrayPositions
					}
					locs = append(locs, interimLoc{
						fieldID:   locf,
						pos:       uint64(loc.Position),
						start:     uint64(loc.Start),
						end:       uint64(loc.End),
						arrayposs: arrayposs,
					})
				}

				s.Locs[pid] = locs
			}
		}
	}
}
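
// writeStoredFields writes, per document: two uvarints (metadata
// length, then _id length plus compressed-data length), the metadata
// (which itself begins with the _id value's length), the raw _id value
// (kept uncompressed to optimize ExternalID lookups), and the
// snappy-compressed remaining stored field data. A fixed-width
// big-endian offset per document is appended afterwards, and the
// returned storedIndexOffset points at that offsets table.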
func (s *interim) writeStoredFields() (
	storedIndexOffset uint64, err error) {
	varBuf := make([]byte, binary.MaxVarintLen64)
	metaEncode := func(val uint64) (int, error) {
		wb := binary.PutUvarint(varBuf, val)
		return s.metaBuf.Write(varBuf[:wb])
	}

	data, compressed := s.tmp0[:0], s.tmp1[:0]
	defer func() { s.tmp0, s.tmp1 = data, compressed }()

	// keyed by docNum
	docStoredOffsets := make([]uint64, len(s.results))

	// keyed by fieldID, for the current doc in the loop
	docStoredFields := map[uint16]interimStoredField{}

	for docNum, result := range s.results {
		for fieldID := range docStoredFields { // reset for next doc
			delete(docStoredFields, fieldID)
		}

		for _, field := range result.Document.Fields {
			fieldID := uint16(s.getOrDefineField(field.Name()))

			opts := field.Options()

			if opts.IsStored() {
				isf := docStoredFields[fieldID]
				isf.vals = append(isf.vals, field.Value())
				isf.typs = append(isf.typs, encodeFieldType(field))
				isf.arrayposs = append(isf.arrayposs, field.ArrayPositions())
				docStoredFields[fieldID] = isf
			}

			if opts.IncludeDocValues() {
				s.IncludeDocValues[fieldID] = true
			}

			err := ValidateDocFields(field)
			if err != nil {
				return 0, err
			}
		}

		var curr int

		s.metaBuf.Reset()
		data = data[:0]

		// _id field special case optimizes ExternalID() lookups
		idFieldVal := docStoredFields[uint16(0)].vals[0]

		_, err = metaEncode(uint64(len(idFieldVal)))
		if err != nil {
			return 0, err
		}

		// handle non-"_id" fields
		for fieldID := 1; fieldID < len(s.FieldsInv); fieldID++ {
			isf, exists := docStoredFields[uint16(fieldID)]
			if exists {
				curr, data, err = persistStoredFieldValues(
					fieldID, isf.vals, isf.typs, isf.arrayposs,
					curr, metaEncode, data)
				if err != nil {
					return 0, err
				}
			}
		}

		metaBytes := s.metaBuf.Bytes()

		compressed = snappy.Encode(compressed[:cap(compressed)], data)

		docStoredOffsets[docNum] = uint64(s.w.Count())

		_, err := writeUvarints(s.w,
			uint64(len(metaBytes)),
			uint64(len(idFieldVal)+len(compressed)))
		if err != nil {
			return 0, err
		}

		_, err = s.w.Write(metaBytes)
		if err != nil {
			return 0, err
		}

		_, err = s.w.Write(idFieldVal)
		if err != nil {
			return 0, err
		}

		_, err = s.w.Write(compressed)
		if err != nil {
			return 0, err
		}
	}

	storedIndexOffset = uint64(s.w.Count())

	for _, docStoredOffset := range docStoredOffsets {
		err = binary.Write(s.w, binary.BigEndian, docStoredOffset)
		if err != nil {
			return 0, err
		}
	}

	return storedIndexOffset, nil
}
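
// writeDicts writes, for each field: per-term postings (with chunked
// freq/norm and location encodings), a vellum FST mapping term ->
// postings offset (prefixed by its length), and, when doc values are
// requested for the field, the chunked doc-value content. A footer of
// uvarint start/end doc-value offsets per field is written last, and
// its position is returned as fdvIndexOffset.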
func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err error) {
	dictOffsets = make([]uint64, len(s.FieldsInv))

	fdvOffsetsStart := make([]uint64, len(s.FieldsInv))
	fdvOffsetsEnd := make([]uint64, len(s.FieldsInv))

	buf := s.grabBuf(binary.MaxVarintLen64)

	tfEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1))
	locEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1))
	fdvEncoder := newChunkedContentCoder(uint64(s.chunkFactor), uint64(len(s.results)-1), s.w, false)

	var docTermMap [][]byte

	if s.builder == nil {
		s.builder, err = vellum.New(&s.builderBuf, nil)
		if err != nil {
			return 0, nil, err
		}
	}

	for fieldID, terms := range s.DictKeys {
		if cap(docTermMap) < len(s.results) {
			docTermMap = make([][]byte, len(s.results))
		} else {
			docTermMap = docTermMap[0:len(s.results)]
			for docNum := range docTermMap { // reset the docTermMap
				docTermMap[docNum] = docTermMap[docNum][:0]
			}
		}

		dict := s.Dicts[fieldID]

		for _, term := range terms { // terms are already sorted
			pid := dict[term] - 1

			postingsBS := s.Postings[pid]

			freqNorms := s.FreqNorms[pid]
			freqNormOffset := 0

			locs := s.Locs[pid]
			locOffset := 0

			postingsItr := postingsBS.Iterator()
			for postingsItr.HasNext() {
				docNum := uint64(postingsItr.Next())

				freqNorm := freqNorms[freqNormOffset]

				err = tfEncoder.Add(docNum,
					encodeFreqHasLocs(freqNorm.freq, freqNorm.numLocs > 0),
					uint64(math.Float32bits(freqNorm.norm)))
				if err != nil {
					return 0, nil, err
				}

				if freqNorm.numLocs > 0 {
					numBytesLocs := 0
					for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] {
						numBytesLocs += totalUvarintBytes(
							uint64(loc.fieldID), loc.pos, loc.start, loc.end,
							uint64(len(loc.arrayposs)), loc.arrayposs)
					}

					err = locEncoder.Add(docNum, uint64(numBytesLocs))
					if err != nil {
						return 0, nil, err
					}

					for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] {
						err = locEncoder.Add(docNum,
							uint64(loc.fieldID), loc.pos, loc.start, loc.end,
							uint64(len(loc.arrayposs)))
						if err != nil {
							return 0, nil, err
						}

						err = locEncoder.Add(docNum, loc.arrayposs...)
						if err != nil {
							return 0, nil, err
						}
					}

					locOffset += freqNorm.numLocs
				}

				freqNormOffset++

				docTermMap[docNum] = append(
					append(docTermMap[docNum], term...),
					termSeparator)
			}

			tfEncoder.Close()
			locEncoder.Close()

			postingsOffset, err :=
				writePostings(postingsBS, tfEncoder, locEncoder, nil, s.w, buf)
			if err != nil {
				return 0, nil, err
			}

			if postingsOffset > uint64(0) {
				err = s.builder.Insert([]byte(term), postingsOffset)
				if err != nil {
					return 0, nil, err
				}
			}

			tfEncoder.Reset()
			locEncoder.Reset()
		}

		err = s.builder.Close()
		if err != nil {
			return 0, nil, err
		}

		// record where this dictionary starts
		dictOffsets[fieldID] = uint64(s.w.Count())

		vellumData := s.builderBuf.Bytes()

		// write out the length of the vellum data
		n := binary.PutUvarint(buf, uint64(len(vellumData)))
		_, err = s.w.Write(buf[:n])
		if err != nil {
			return 0, nil, err
		}

		// write this vellum to disk
		_, err = s.w.Write(vellumData)
		if err != nil {
			return 0, nil, err
		}

		// reset vellum for reuse
		s.builderBuf.Reset()
		err = s.builder.Reset(&s.builderBuf)
		if err != nil {
			return 0, nil, err
		}

		// write the field doc values
		if s.IncludeDocValues[fieldID] {
			for docNum, docTerms := range docTermMap {
				if len(docTerms) > 0 {
					err = fdvEncoder.Add(uint64(docNum), docTerms)
					if err != nil {
						return 0, nil, err
					}
				}
			}
			err = fdvEncoder.Close()
			if err != nil {
				return 0, nil, err
			}

			fdvOffsetsStart[fieldID] = uint64(s.w.Count())

			_, err = fdvEncoder.Write()
			if err != nil {
				return 0, nil, err
			}

			fdvOffsetsEnd[fieldID] = uint64(s.w.Count())

			fdvEncoder.Reset()
		} else {
			fdvOffsetsStart[fieldID] = fieldNotUninverted
			fdvOffsetsEnd[fieldID] = fieldNotUninverted
		}
	}

	fdvIndexOffset = uint64(s.w.Count())

	for i := 0; i < len(fdvOffsetsStart); i++ {
		n := binary.PutUvarint(buf, fdvOffsetsStart[i])
		_, err := s.w.Write(buf[:n])
		if err != nil {
			return 0, nil, err
		}
		n = binary.PutUvarint(buf, fdvOffsetsEnd[i])
		_, err = s.w.Write(buf[:n])
		if err != nil {
			return 0, nil, err
		}
	}

	return fdvIndexOffset, dictOffsets, nil
}
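
// encodeFieldType maps a concrete document.Field implementation to the
// single-byte type tag stored alongside field values; 'x' marks an
// unrecognized field type.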
func encodeFieldType(f document.Field) byte {
	fieldType := byte('x')
	switch f.(type) {
	case *document.TextField:
		fieldType = 't'
	case *document.NumericField:
		fieldType = 'n'
	case *document.DateTimeField:
		fieldType = 'd'
	case *document.BooleanField:
		fieldType = 'b'
	case *document.GeoPointField:
		fieldType = 'g'
	case *document.CompositeField:
		fieldType = 'c'
	}
	return fieldType
}

// totalUvarintBytes returns the total number of bytes needed to encode
// the given uint64's in binary.PutUvarint() encoding
func totalUvarintBytes(a, b, c, d, e uint64, more []uint64) (n int) {
	n = numUvarintBytes(a)
	n += numUvarintBytes(b)
	n += numUvarintBytes(c)
	n += numUvarintBytes(d)
	n += numUvarintBytes(e)
	for _, v := range more {
		n += numUvarintBytes(v)
	}
	return n
}

// numUvarintBytes returns the number of bytes needed to encode x in
// binary.PutUvarint() encoding
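// For example, values below 0x80 need one byte and each further seven
// significant bits add another, so numUvarintBytes(300) == 2.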
func numUvarintBytes(x uint64) (n int) {
	for x >= 0x80 {
		x >>= 7
		n++
	}
	return n + 1
}