You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

row.go 26KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112
  1. // Copyright (c) 2014 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package upsidedown
  15. import (
  16. "bytes"
  17. "encoding/binary"
  18. "fmt"
  19. "io"
  20. "math"
  21. "github.com/golang/protobuf/proto"
  22. )
  23. const ByteSeparator byte = 0xff
  24. type UpsideDownCouchRowStream chan UpsideDownCouchRow
  25. type UpsideDownCouchRow interface {
  26. KeySize() int
  27. KeyTo([]byte) (int, error)
  28. Key() []byte
  29. Value() []byte
  30. ValueSize() int
  31. ValueTo([]byte) (int, error)
  32. }
  33. func ParseFromKeyValue(key, value []byte) (UpsideDownCouchRow, error) {
  34. if len(key) > 0 {
  35. switch key[0] {
  36. case 'v':
  37. return NewVersionRowKV(key, value)
  38. case 'f':
  39. return NewFieldRowKV(key, value)
  40. case 'd':
  41. return NewDictionaryRowKV(key, value)
  42. case 't':
  43. return NewTermFrequencyRowKV(key, value)
  44. case 'b':
  45. return NewBackIndexRowKV(key, value)
  46. case 's':
  47. return NewStoredRowKV(key, value)
  48. case 'i':
  49. return NewInternalRowKV(key, value)
  50. }
  51. return nil, fmt.Errorf("Unknown field type '%s'", string(key[0]))
  52. }
  53. return nil, fmt.Errorf("Invalid empty key")
  54. }
  55. // VERSION
  56. type VersionRow struct {
  57. version uint8
  58. }
  59. func (v *VersionRow) Key() []byte {
  60. return []byte{'v'}
  61. }
  62. func (v *VersionRow) KeySize() int {
  63. return 1
  64. }
  65. func (v *VersionRow) KeyTo(buf []byte) (int, error) {
  66. buf[0] = 'v'
  67. return 1, nil
  68. }
  69. func (v *VersionRow) Value() []byte {
  70. return []byte{byte(v.version)}
  71. }
  72. func (v *VersionRow) ValueSize() int {
  73. return 1
  74. }
  75. func (v *VersionRow) ValueTo(buf []byte) (int, error) {
  76. buf[0] = v.version
  77. return 1, nil
  78. }
  79. func (v *VersionRow) String() string {
  80. return fmt.Sprintf("Version: %d", v.version)
  81. }
  82. func NewVersionRow(version uint8) *VersionRow {
  83. return &VersionRow{
  84. version: version,
  85. }
  86. }
  87. func NewVersionRowKV(key, value []byte) (*VersionRow, error) {
  88. rv := VersionRow{}
  89. buf := bytes.NewBuffer(value)
  90. err := binary.Read(buf, binary.LittleEndian, &rv.version)
  91. if err != nil {
  92. return nil, err
  93. }
  94. return &rv, nil
  95. }
  96. // INTERNAL STORAGE
  97. type InternalRow struct {
  98. key []byte
  99. val []byte
  100. }
  101. func (i *InternalRow) Key() []byte {
  102. buf := make([]byte, i.KeySize())
  103. size, _ := i.KeyTo(buf)
  104. return buf[:size]
  105. }
  106. func (i *InternalRow) KeySize() int {
  107. return len(i.key) + 1
  108. }
  109. func (i *InternalRow) KeyTo(buf []byte) (int, error) {
  110. buf[0] = 'i'
  111. actual := copy(buf[1:], i.key)
  112. return 1 + actual, nil
  113. }
  114. func (i *InternalRow) Value() []byte {
  115. return i.val
  116. }
  117. func (i *InternalRow) ValueSize() int {
  118. return len(i.val)
  119. }
  120. func (i *InternalRow) ValueTo(buf []byte) (int, error) {
  121. actual := copy(buf, i.val)
  122. return actual, nil
  123. }
  124. func (i *InternalRow) String() string {
  125. return fmt.Sprintf("InternalStore - Key: %s (% x) Val: %s (% x)", i.key, i.key, i.val, i.val)
  126. }
  127. func NewInternalRow(key, val []byte) *InternalRow {
  128. return &InternalRow{
  129. key: key,
  130. val: val,
  131. }
  132. }
  133. func NewInternalRowKV(key, value []byte) (*InternalRow, error) {
  134. rv := InternalRow{}
  135. rv.key = key[1:]
  136. rv.val = value
  137. return &rv, nil
  138. }
  139. // FIELD definition
  140. type FieldRow struct {
  141. index uint16
  142. name string
  143. }
  144. func (f *FieldRow) Key() []byte {
  145. buf := make([]byte, f.KeySize())
  146. size, _ := f.KeyTo(buf)
  147. return buf[:size]
  148. }
  149. func (f *FieldRow) KeySize() int {
  150. return 3
  151. }
  152. func (f *FieldRow) KeyTo(buf []byte) (int, error) {
  153. buf[0] = 'f'
  154. binary.LittleEndian.PutUint16(buf[1:3], f.index)
  155. return 3, nil
  156. }
  157. func (f *FieldRow) Value() []byte {
  158. return append([]byte(f.name), ByteSeparator)
  159. }
  160. func (f *FieldRow) ValueSize() int {
  161. return len(f.name) + 1
  162. }
  163. func (f *FieldRow) ValueTo(buf []byte) (int, error) {
  164. size := copy(buf, f.name)
  165. buf[size] = ByteSeparator
  166. return size + 1, nil
  167. }
  168. func (f *FieldRow) String() string {
  169. return fmt.Sprintf("Field: %d Name: %s", f.index, f.name)
  170. }
  171. func NewFieldRow(index uint16, name string) *FieldRow {
  172. return &FieldRow{
  173. index: index,
  174. name: name,
  175. }
  176. }
  177. func NewFieldRowKV(key, value []byte) (*FieldRow, error) {
  178. rv := FieldRow{}
  179. buf := bytes.NewBuffer(key)
  180. _, err := buf.ReadByte() // type
  181. if err != nil {
  182. return nil, err
  183. }
  184. err = binary.Read(buf, binary.LittleEndian, &rv.index)
  185. if err != nil {
  186. return nil, err
  187. }
  188. buf = bytes.NewBuffer(value)
  189. rv.name, err = buf.ReadString(ByteSeparator)
  190. if err != nil {
  191. return nil, err
  192. }
  193. rv.name = rv.name[:len(rv.name)-1] // trim off separator byte
  194. return &rv, nil
  195. }
  196. // DICTIONARY
  197. const DictionaryRowMaxValueSize = binary.MaxVarintLen64
  198. type DictionaryRow struct {
  199. term []byte
  200. count uint64
  201. field uint16
  202. }
  203. func (dr *DictionaryRow) Key() []byte {
  204. buf := make([]byte, dr.KeySize())
  205. size, _ := dr.KeyTo(buf)
  206. return buf[:size]
  207. }
  208. func (dr *DictionaryRow) KeySize() int {
  209. return dictionaryRowKeySize(dr.term)
  210. }
  211. func dictionaryRowKeySize(term []byte) int {
  212. return len(term) + 3
  213. }
  214. func (dr *DictionaryRow) KeyTo(buf []byte) (int, error) {
  215. return dictionaryRowKeyTo(buf, dr.field, dr.term), nil
  216. }
  217. func dictionaryRowKeyTo(buf []byte, field uint16, term []byte) int {
  218. buf[0] = 'd'
  219. binary.LittleEndian.PutUint16(buf[1:3], field)
  220. size := copy(buf[3:], term)
  221. return size + 3
  222. }
  223. func (dr *DictionaryRow) Value() []byte {
  224. buf := make([]byte, dr.ValueSize())
  225. size, _ := dr.ValueTo(buf)
  226. return buf[:size]
  227. }
  228. func (dr *DictionaryRow) ValueSize() int {
  229. return DictionaryRowMaxValueSize
  230. }
  231. func (dr *DictionaryRow) ValueTo(buf []byte) (int, error) {
  232. used := binary.PutUvarint(buf, dr.count)
  233. return used, nil
  234. }
  235. func (dr *DictionaryRow) String() string {
  236. return fmt.Sprintf("Dictionary Term: `%s` Field: %d Count: %d ", string(dr.term), dr.field, dr.count)
  237. }
  238. func NewDictionaryRow(term []byte, field uint16, count uint64) *DictionaryRow {
  239. return &DictionaryRow{
  240. term: term,
  241. field: field,
  242. count: count,
  243. }
  244. }
  245. func NewDictionaryRowKV(key, value []byte) (*DictionaryRow, error) {
  246. rv, err := NewDictionaryRowK(key)
  247. if err != nil {
  248. return nil, err
  249. }
  250. err = rv.parseDictionaryV(value)
  251. if err != nil {
  252. return nil, err
  253. }
  254. return rv, nil
  255. }
  256. func NewDictionaryRowK(key []byte) (*DictionaryRow, error) {
  257. rv := &DictionaryRow{}
  258. err := rv.parseDictionaryK(key)
  259. if err != nil {
  260. return nil, err
  261. }
  262. return rv, nil
  263. }
  264. func (dr *DictionaryRow) parseDictionaryK(key []byte) error {
  265. dr.field = binary.LittleEndian.Uint16(key[1:3])
  266. if dr.term != nil {
  267. dr.term = dr.term[:0]
  268. }
  269. dr.term = append(dr.term, key[3:]...)
  270. return nil
  271. }
  272. func (dr *DictionaryRow) parseDictionaryV(value []byte) error {
  273. count, err := dictionaryRowParseV(value)
  274. if err != nil {
  275. return err
  276. }
  277. dr.count = count
  278. return nil
  279. }
  280. func dictionaryRowParseV(value []byte) (uint64, error) {
  281. count, nread := binary.Uvarint(value)
  282. if nread <= 0 {
  283. return 0, fmt.Errorf("DictionaryRow parse Uvarint error, nread: %d", nread)
  284. }
  285. return count, nil
  286. }
  287. // TERM FIELD FREQUENCY
  288. type TermVector struct {
  289. field uint16
  290. arrayPositions []uint64
  291. pos uint64
  292. start uint64
  293. end uint64
  294. }
  295. func (tv *TermVector) String() string {
  296. return fmt.Sprintf("Field: %d Pos: %d Start: %d End %d ArrayPositions: %#v", tv.field, tv.pos, tv.start, tv.end, tv.arrayPositions)
  297. }
  298. type TermFrequencyRow struct {
  299. term []byte
  300. doc []byte
  301. freq uint64
  302. vectors []*TermVector
  303. norm float32
  304. field uint16
  305. }
  306. func (tfr *TermFrequencyRow) Term() []byte {
  307. return tfr.term
  308. }
  309. func (tfr *TermFrequencyRow) Freq() uint64 {
  310. return tfr.freq
  311. }
  312. func (tfr *TermFrequencyRow) ScanPrefixForField() []byte {
  313. buf := make([]byte, 3)
  314. buf[0] = 't'
  315. binary.LittleEndian.PutUint16(buf[1:3], tfr.field)
  316. return buf
  317. }
  318. func (tfr *TermFrequencyRow) ScanPrefixForFieldTermPrefix() []byte {
  319. buf := make([]byte, 3+len(tfr.term))
  320. buf[0] = 't'
  321. binary.LittleEndian.PutUint16(buf[1:3], tfr.field)
  322. copy(buf[3:], tfr.term)
  323. return buf
  324. }
  325. func (tfr *TermFrequencyRow) ScanPrefixForFieldTerm() []byte {
  326. buf := make([]byte, 3+len(tfr.term)+1)
  327. buf[0] = 't'
  328. binary.LittleEndian.PutUint16(buf[1:3], tfr.field)
  329. termLen := copy(buf[3:], tfr.term)
  330. buf[3+termLen] = ByteSeparator
  331. return buf
  332. }
  333. func (tfr *TermFrequencyRow) Key() []byte {
  334. buf := make([]byte, tfr.KeySize())
  335. size, _ := tfr.KeyTo(buf)
  336. return buf[:size]
  337. }
  338. func (tfr *TermFrequencyRow) KeySize() int {
  339. return termFrequencyRowKeySize(tfr.term, tfr.doc)
  340. }
  341. func termFrequencyRowKeySize(term, doc []byte) int {
  342. return 3 + len(term) + 1 + len(doc)
  343. }
  344. func (tfr *TermFrequencyRow) KeyTo(buf []byte) (int, error) {
  345. return termFrequencyRowKeyTo(buf, tfr.field, tfr.term, tfr.doc), nil
  346. }
  347. func termFrequencyRowKeyTo(buf []byte, field uint16, term, doc []byte) int {
  348. buf[0] = 't'
  349. binary.LittleEndian.PutUint16(buf[1:3], field)
  350. termLen := copy(buf[3:], term)
  351. buf[3+termLen] = ByteSeparator
  352. docLen := copy(buf[3+termLen+1:], doc)
  353. return 3 + termLen + 1 + docLen
  354. }
  355. func (tfr *TermFrequencyRow) KeyAppendTo(buf []byte) ([]byte, error) {
  356. keySize := tfr.KeySize()
  357. if cap(buf) < keySize {
  358. buf = make([]byte, keySize)
  359. }
  360. actualSize, err := tfr.KeyTo(buf[0:keySize])
  361. return buf[0:actualSize], err
  362. }
  363. func (tfr *TermFrequencyRow) DictionaryRowKey() []byte {
  364. dr := NewDictionaryRow(tfr.term, tfr.field, 0)
  365. return dr.Key()
  366. }
  367. func (tfr *TermFrequencyRow) DictionaryRowKeySize() int {
  368. dr := NewDictionaryRow(tfr.term, tfr.field, 0)
  369. return dr.KeySize()
  370. }
  371. func (tfr *TermFrequencyRow) DictionaryRowKeyTo(buf []byte) (int, error) {
  372. dr := NewDictionaryRow(tfr.term, tfr.field, 0)
  373. return dr.KeyTo(buf)
  374. }
  375. func (tfr *TermFrequencyRow) Value() []byte {
  376. buf := make([]byte, tfr.ValueSize())
  377. size, _ := tfr.ValueTo(buf)
  378. return buf[:size]
  379. }
  380. func (tfr *TermFrequencyRow) ValueSize() int {
  381. bufLen := binary.MaxVarintLen64 + binary.MaxVarintLen64
  382. for _, vector := range tfr.vectors {
  383. bufLen += (binary.MaxVarintLen64 * 4) + (1+len(vector.arrayPositions))*binary.MaxVarintLen64
  384. }
  385. return bufLen
  386. }
  387. func (tfr *TermFrequencyRow) ValueTo(buf []byte) (int, error) {
  388. used := binary.PutUvarint(buf[:binary.MaxVarintLen64], tfr.freq)
  389. normuint32 := math.Float32bits(tfr.norm)
  390. newbuf := buf[used : used+binary.MaxVarintLen64]
  391. used += binary.PutUvarint(newbuf, uint64(normuint32))
  392. for _, vector := range tfr.vectors {
  393. used += binary.PutUvarint(buf[used:used+binary.MaxVarintLen64], uint64(vector.field))
  394. used += binary.PutUvarint(buf[used:used+binary.MaxVarintLen64], vector.pos)
  395. used += binary.PutUvarint(buf[used:used+binary.MaxVarintLen64], vector.start)
  396. used += binary.PutUvarint(buf[used:used+binary.MaxVarintLen64], vector.end)
  397. used += binary.PutUvarint(buf[used:used+binary.MaxVarintLen64], uint64(len(vector.arrayPositions)))
  398. for _, arrayPosition := range vector.arrayPositions {
  399. used += binary.PutUvarint(buf[used:used+binary.MaxVarintLen64], arrayPosition)
  400. }
  401. }
  402. return used, nil
  403. }
  404. func (tfr *TermFrequencyRow) String() string {
  405. return fmt.Sprintf("Term: `%s` Field: %d DocId: `%s` Frequency: %d Norm: %f Vectors: %v", string(tfr.term), tfr.field, string(tfr.doc), tfr.freq, tfr.norm, tfr.vectors)
  406. }
  407. func InitTermFrequencyRow(tfr *TermFrequencyRow, term []byte, field uint16, docID []byte, freq uint64, norm float32) *TermFrequencyRow {
  408. tfr.term = term
  409. tfr.field = field
  410. tfr.doc = docID
  411. tfr.freq = freq
  412. tfr.norm = norm
  413. return tfr
  414. }
  415. func NewTermFrequencyRow(term []byte, field uint16, docID []byte, freq uint64, norm float32) *TermFrequencyRow {
  416. return &TermFrequencyRow{
  417. term: term,
  418. field: field,
  419. doc: docID,
  420. freq: freq,
  421. norm: norm,
  422. }
  423. }
  424. func NewTermFrequencyRowWithTermVectors(term []byte, field uint16, docID []byte, freq uint64, norm float32, vectors []*TermVector) *TermFrequencyRow {
  425. return &TermFrequencyRow{
  426. term: term,
  427. field: field,
  428. doc: docID,
  429. freq: freq,
  430. norm: norm,
  431. vectors: vectors,
  432. }
  433. }
  434. func NewTermFrequencyRowK(key []byte) (*TermFrequencyRow, error) {
  435. rv := &TermFrequencyRow{}
  436. err := rv.parseK(key)
  437. if err != nil {
  438. return nil, err
  439. }
  440. return rv, nil
  441. }
  442. func (tfr *TermFrequencyRow) parseK(key []byte) error {
  443. keyLen := len(key)
  444. if keyLen < 3 {
  445. return fmt.Errorf("invalid term frequency key, no valid field")
  446. }
  447. tfr.field = binary.LittleEndian.Uint16(key[1:3])
  448. termEndPos := bytes.IndexByte(key[3:], ByteSeparator)
  449. if termEndPos < 0 {
  450. return fmt.Errorf("invalid term frequency key, no byte separator terminating term")
  451. }
  452. tfr.term = key[3 : 3+termEndPos]
  453. docLen := keyLen - (3 + termEndPos + 1)
  454. if docLen < 1 {
  455. return fmt.Errorf("invalid term frequency key, empty docid")
  456. }
  457. tfr.doc = key[3+termEndPos+1:]
  458. return nil
  459. }
  460. func (tfr *TermFrequencyRow) parseKDoc(key []byte, term []byte) error {
  461. tfr.doc = key[3+len(term)+1:]
  462. if len(tfr.doc) <= 0 {
  463. return fmt.Errorf("invalid term frequency key, empty docid")
  464. }
  465. return nil
  466. }
  467. func (tfr *TermFrequencyRow) parseV(value []byte, includeTermVectors bool) error {
  468. var bytesRead int
  469. tfr.freq, bytesRead = binary.Uvarint(value)
  470. if bytesRead <= 0 {
  471. return fmt.Errorf("invalid term frequency value, invalid frequency")
  472. }
  473. currOffset := bytesRead
  474. var norm uint64
  475. norm, bytesRead = binary.Uvarint(value[currOffset:])
  476. if bytesRead <= 0 {
  477. return fmt.Errorf("invalid term frequency value, no norm")
  478. }
  479. currOffset += bytesRead
  480. tfr.norm = math.Float32frombits(uint32(norm))
  481. tfr.vectors = nil
  482. if !includeTermVectors {
  483. return nil
  484. }
  485. var field uint64
  486. field, bytesRead = binary.Uvarint(value[currOffset:])
  487. for bytesRead > 0 {
  488. currOffset += bytesRead
  489. tv := TermVector{}
  490. tv.field = uint16(field)
  491. // at this point we expect at least one term vector
  492. if tfr.vectors == nil {
  493. tfr.vectors = make([]*TermVector, 0)
  494. }
  495. tv.pos, bytesRead = binary.Uvarint(value[currOffset:])
  496. if bytesRead <= 0 {
  497. return fmt.Errorf("invalid term frequency value, vector contains no position")
  498. }
  499. currOffset += bytesRead
  500. tv.start, bytesRead = binary.Uvarint(value[currOffset:])
  501. if bytesRead <= 0 {
  502. return fmt.Errorf("invalid term frequency value, vector contains no start")
  503. }
  504. currOffset += bytesRead
  505. tv.end, bytesRead = binary.Uvarint(value[currOffset:])
  506. if bytesRead <= 0 {
  507. return fmt.Errorf("invalid term frequency value, vector contains no end")
  508. }
  509. currOffset += bytesRead
  510. var arrayPositionsLen uint64 = 0
  511. arrayPositionsLen, bytesRead = binary.Uvarint(value[currOffset:])
  512. if bytesRead <= 0 {
  513. return fmt.Errorf("invalid term frequency value, vector contains no arrayPositionLen")
  514. }
  515. currOffset += bytesRead
  516. if arrayPositionsLen > 0 {
  517. tv.arrayPositions = make([]uint64, arrayPositionsLen)
  518. for i := 0; uint64(i) < arrayPositionsLen; i++ {
  519. tv.arrayPositions[i], bytesRead = binary.Uvarint(value[currOffset:])
  520. if bytesRead <= 0 {
  521. return fmt.Errorf("invalid term frequency value, vector contains no arrayPosition of index %d", i)
  522. }
  523. currOffset += bytesRead
  524. }
  525. }
  526. tfr.vectors = append(tfr.vectors, &tv)
  527. // try to read next record (may not exist)
  528. field, bytesRead = binary.Uvarint(value[currOffset:])
  529. }
  530. if len(value[currOffset:]) > 0 && bytesRead <= 0 {
  531. return fmt.Errorf("invalid term frequency value, vector field invalid")
  532. }
  533. return nil
  534. }
  535. func NewTermFrequencyRowKV(key, value []byte) (*TermFrequencyRow, error) {
  536. rv, err := NewTermFrequencyRowK(key)
  537. if err != nil {
  538. return nil, err
  539. }
  540. err = rv.parseV(value, true)
  541. if err != nil {
  542. return nil, err
  543. }
  544. return rv, nil
  545. }
  546. type BackIndexRow struct {
  547. doc []byte
  548. termsEntries []*BackIndexTermsEntry
  549. storedEntries []*BackIndexStoreEntry
  550. }
  551. func (br *BackIndexRow) AllTermKeys() [][]byte {
  552. if br == nil {
  553. return nil
  554. }
  555. rv := make([][]byte, 0, len(br.termsEntries)) // FIXME this underestimates severely
  556. for _, termsEntry := range br.termsEntries {
  557. for i := range termsEntry.Terms {
  558. termRow := NewTermFrequencyRow([]byte(termsEntry.Terms[i]), uint16(termsEntry.GetField()), br.doc, 0, 0)
  559. rv = append(rv, termRow.Key())
  560. }
  561. }
  562. return rv
  563. }
  564. func (br *BackIndexRow) AllStoredKeys() [][]byte {
  565. if br == nil {
  566. return nil
  567. }
  568. rv := make([][]byte, len(br.storedEntries))
  569. for i, storedEntry := range br.storedEntries {
  570. storedRow := NewStoredRow(br.doc, uint16(storedEntry.GetField()), storedEntry.GetArrayPositions(), 'x', []byte{})
  571. rv[i] = storedRow.Key()
  572. }
  573. return rv
  574. }
  575. func (br *BackIndexRow) Key() []byte {
  576. buf := make([]byte, br.KeySize())
  577. size, _ := br.KeyTo(buf)
  578. return buf[:size]
  579. }
  580. func (br *BackIndexRow) KeySize() int {
  581. return len(br.doc) + 1
  582. }
  583. func (br *BackIndexRow) KeyTo(buf []byte) (int, error) {
  584. buf[0] = 'b'
  585. used := copy(buf[1:], br.doc)
  586. return used + 1, nil
  587. }
  588. func (br *BackIndexRow) Value() []byte {
  589. buf := make([]byte, br.ValueSize())
  590. size, _ := br.ValueTo(buf)
  591. return buf[:size]
  592. }
  593. func (br *BackIndexRow) ValueSize() int {
  594. birv := &BackIndexRowValue{
  595. TermsEntries: br.termsEntries,
  596. StoredEntries: br.storedEntries,
  597. }
  598. return birv.Size()
  599. }
  600. func (br *BackIndexRow) ValueTo(buf []byte) (int, error) {
  601. birv := &BackIndexRowValue{
  602. TermsEntries: br.termsEntries,
  603. StoredEntries: br.storedEntries,
  604. }
  605. return birv.MarshalTo(buf)
  606. }
  607. func (br *BackIndexRow) String() string {
  608. return fmt.Sprintf("Backindex DocId: `%s` Terms Entries: %v, Stored Entries: %v", string(br.doc), br.termsEntries, br.storedEntries)
  609. }
  610. func NewBackIndexRow(docID []byte, entries []*BackIndexTermsEntry, storedFields []*BackIndexStoreEntry) *BackIndexRow {
  611. return &BackIndexRow{
  612. doc: docID,
  613. termsEntries: entries,
  614. storedEntries: storedFields,
  615. }
  616. }
  617. func NewBackIndexRowKV(key, value []byte) (*BackIndexRow, error) {
  618. rv := BackIndexRow{}
  619. buf := bytes.NewBuffer(key)
  620. _, err := buf.ReadByte() // type
  621. if err != nil {
  622. return nil, err
  623. }
  624. rv.doc, err = buf.ReadBytes(ByteSeparator)
  625. if err == io.EOF && len(rv.doc) < 1 {
  626. err = fmt.Errorf("invalid doc length 0 - % x", key)
  627. }
  628. if err != nil && err != io.EOF {
  629. return nil, err
  630. } else if err == nil {
  631. rv.doc = rv.doc[:len(rv.doc)-1] // trim off separator byte
  632. }
  633. var birv BackIndexRowValue
  634. err = proto.Unmarshal(value, &birv)
  635. if err != nil {
  636. return nil, err
  637. }
  638. rv.termsEntries = birv.TermsEntries
  639. rv.storedEntries = birv.StoredEntries
  640. return &rv, nil
  641. }
  642. // STORED
  643. type StoredRow struct {
  644. doc []byte
  645. field uint16
  646. arrayPositions []uint64
  647. typ byte
  648. value []byte
  649. }
  650. func (s *StoredRow) Key() []byte {
  651. buf := make([]byte, s.KeySize())
  652. size, _ := s.KeyTo(buf)
  653. return buf[0:size]
  654. }
  655. func (s *StoredRow) KeySize() int {
  656. return 1 + len(s.doc) + 1 + 2 + (binary.MaxVarintLen64 * len(s.arrayPositions))
  657. }
  658. func (s *StoredRow) KeyTo(buf []byte) (int, error) {
  659. docLen := len(s.doc)
  660. buf[0] = 's'
  661. copy(buf[1:], s.doc)
  662. buf[1+docLen] = ByteSeparator
  663. binary.LittleEndian.PutUint16(buf[1+docLen+1:], s.field)
  664. bytesUsed := 1 + docLen + 1 + 2
  665. for _, arrayPosition := range s.arrayPositions {
  666. varbytes := binary.PutUvarint(buf[bytesUsed:], arrayPosition)
  667. bytesUsed += varbytes
  668. }
  669. return bytesUsed, nil
  670. }
  671. func (s *StoredRow) Value() []byte {
  672. buf := make([]byte, s.ValueSize())
  673. size, _ := s.ValueTo(buf)
  674. return buf[:size]
  675. }
  676. func (s *StoredRow) ValueSize() int {
  677. return len(s.value) + 1
  678. }
  679. func (s *StoredRow) ValueTo(buf []byte) (int, error) {
  680. buf[0] = s.typ
  681. used := copy(buf[1:], s.value)
  682. return used + 1, nil
  683. }
  684. func (s *StoredRow) String() string {
  685. return fmt.Sprintf("Document: %s Field %d, Array Positions: %v, Type: %s Value: %s", s.doc, s.field, s.arrayPositions, string(s.typ), s.value)
  686. }
  687. func (s *StoredRow) ScanPrefixForDoc() []byte {
  688. docLen := len(s.doc)
  689. buf := make([]byte, 1+docLen+1)
  690. buf[0] = 's'
  691. copy(buf[1:], s.doc)
  692. buf[1+docLen] = ByteSeparator
  693. return buf
  694. }
  695. func NewStoredRow(docID []byte, field uint16, arrayPositions []uint64, typ byte, value []byte) *StoredRow {
  696. return &StoredRow{
  697. doc: docID,
  698. field: field,
  699. arrayPositions: arrayPositions,
  700. typ: typ,
  701. value: value,
  702. }
  703. }
  704. func NewStoredRowK(key []byte) (*StoredRow, error) {
  705. rv := StoredRow{}
  706. buf := bytes.NewBuffer(key)
  707. _, err := buf.ReadByte() // type
  708. if err != nil {
  709. return nil, err
  710. }
  711. rv.doc, err = buf.ReadBytes(ByteSeparator)
  712. if len(rv.doc) < 2 { // 1 for min doc id length, 1 for separator
  713. err = fmt.Errorf("invalid doc length 0")
  714. return nil, err
  715. }
  716. rv.doc = rv.doc[:len(rv.doc)-1] // trim off separator byte
  717. err = binary.Read(buf, binary.LittleEndian, &rv.field)
  718. if err != nil {
  719. return nil, err
  720. }
  721. rv.arrayPositions = make([]uint64, 0)
  722. nextArrayPos, err := binary.ReadUvarint(buf)
  723. for err == nil {
  724. rv.arrayPositions = append(rv.arrayPositions, nextArrayPos)
  725. nextArrayPos, err = binary.ReadUvarint(buf)
  726. }
  727. return &rv, nil
  728. }
  729. func NewStoredRowKV(key, value []byte) (*StoredRow, error) {
  730. rv, err := NewStoredRowK(key)
  731. if err != nil {
  732. return nil, err
  733. }
  734. rv.typ = value[0]
  735. rv.value = value[1:]
  736. return rv, nil
  737. }
  738. type backIndexFieldTermVisitor func(field uint32, term []byte)
  739. // visitBackIndexRow is designed to process a protobuf encoded
  740. // value, without creating unnecessary garbage. Instead values are passed
  741. // to a callback, inspected first, and only copied if necessary.
  742. // Due to the fact that this borrows from generated code, it must be marnually
  743. // updated if the protobuf definition changes.
  744. //
  745. // This code originates from:
  746. // func (m *BackIndexRowValue) Unmarshal(data []byte) error
  747. // the sections which create garbage or parse unintersting sections
  748. // have been commented out. This was done by design to allow for easier
  749. // merging in the future if that original function is regenerated
  750. func visitBackIndexRow(data []byte, callback backIndexFieldTermVisitor) error {
  751. l := len(data)
  752. iNdEx := 0
  753. for iNdEx < l {
  754. var wire uint64
  755. for shift := uint(0); ; shift += 7 {
  756. if iNdEx >= l {
  757. return io.ErrUnexpectedEOF
  758. }
  759. b := data[iNdEx]
  760. iNdEx++
  761. wire |= (uint64(b) & 0x7F) << shift
  762. if b < 0x80 {
  763. break
  764. }
  765. }
  766. fieldNum := int32(wire >> 3)
  767. wireType := int(wire & 0x7)
  768. switch fieldNum {
  769. case 1:
  770. if wireType != 2 {
  771. return fmt.Errorf("proto: wrong wireType = %d for field TermsEntries", wireType)
  772. }
  773. var msglen int
  774. for shift := uint(0); ; shift += 7 {
  775. if iNdEx >= l {
  776. return io.ErrUnexpectedEOF
  777. }
  778. b := data[iNdEx]
  779. iNdEx++
  780. msglen |= (int(b) & 0x7F) << shift
  781. if b < 0x80 {
  782. break
  783. }
  784. }
  785. postIndex := iNdEx + msglen
  786. if msglen < 0 {
  787. return ErrInvalidLengthUpsidedown
  788. }
  789. if postIndex > l {
  790. return io.ErrUnexpectedEOF
  791. }
  792. // dont parse term entries
  793. // m.TermsEntries = append(m.TermsEntries, &BackIndexTermsEntry{})
  794. // if err := m.TermsEntries[len(m.TermsEntries)-1].Unmarshal(data[iNdEx:postIndex]); err != nil {
  795. // return err
  796. // }
  797. // instead, inspect them
  798. if err := visitBackIndexRowFieldTerms(data[iNdEx:postIndex], callback); err != nil {
  799. return err
  800. }
  801. iNdEx = postIndex
  802. case 2:
  803. if wireType != 2 {
  804. return fmt.Errorf("proto: wrong wireType = %d for field StoredEntries", wireType)
  805. }
  806. var msglen int
  807. for shift := uint(0); ; shift += 7 {
  808. if iNdEx >= l {
  809. return io.ErrUnexpectedEOF
  810. }
  811. b := data[iNdEx]
  812. iNdEx++
  813. msglen |= (int(b) & 0x7F) << shift
  814. if b < 0x80 {
  815. break
  816. }
  817. }
  818. postIndex := iNdEx + msglen
  819. if msglen < 0 {
  820. return ErrInvalidLengthUpsidedown
  821. }
  822. if postIndex > l {
  823. return io.ErrUnexpectedEOF
  824. }
  825. // don't parse stored entries
  826. // m.StoredEntries = append(m.StoredEntries, &BackIndexStoreEntry{})
  827. // if err := m.StoredEntries[len(m.StoredEntries)-1].Unmarshal(data[iNdEx:postIndex]); err != nil {
  828. // return err
  829. // }
  830. iNdEx = postIndex
  831. default:
  832. var sizeOfWire int
  833. for {
  834. sizeOfWire++
  835. wire >>= 7
  836. if wire == 0 {
  837. break
  838. }
  839. }
  840. iNdEx -= sizeOfWire
  841. skippy, err := skipUpsidedown(data[iNdEx:])
  842. if err != nil {
  843. return err
  844. }
  845. if skippy < 0 {
  846. return ErrInvalidLengthUpsidedown
  847. }
  848. if (iNdEx + skippy) > l {
  849. return io.ErrUnexpectedEOF
  850. }
  851. // don't track unrecognized data
  852. //m.XXX_unrecognized = append(m.XXX_unrecognized, data[iNdEx:iNdEx+skippy]...)
  853. iNdEx += skippy
  854. }
  855. }
  856. return nil
  857. }
  858. // visitBackIndexRowFieldTerms is designed to process a protobuf encoded
  859. // sub-value within the BackIndexRowValue, without creating unnecessary garbage.
  860. // Instead values are passed to a callback, inspected first, and only copied if
  861. // necessary. Due to the fact that this borrows from generated code, it must
  862. // be marnually updated if the protobuf definition changes.
  863. //
  864. // This code originates from:
  865. // func (m *BackIndexTermsEntry) Unmarshal(data []byte) error {
  866. // the sections which create garbage or parse uninteresting sections
  867. // have been commented out. This was done by design to allow for easier
  868. // merging in the future if that original function is regenerated
  869. func visitBackIndexRowFieldTerms(data []byte, callback backIndexFieldTermVisitor) error {
  870. var theField uint32
  871. var hasFields [1]uint64
  872. l := len(data)
  873. iNdEx := 0
  874. for iNdEx < l {
  875. var wire uint64
  876. for shift := uint(0); ; shift += 7 {
  877. if iNdEx >= l {
  878. return io.ErrUnexpectedEOF
  879. }
  880. b := data[iNdEx]
  881. iNdEx++
  882. wire |= (uint64(b) & 0x7F) << shift
  883. if b < 0x80 {
  884. break
  885. }
  886. }
  887. fieldNum := int32(wire >> 3)
  888. wireType := int(wire & 0x7)
  889. switch fieldNum {
  890. case 1:
  891. if wireType != 0 {
  892. return fmt.Errorf("proto: wrong wireType = %d for field Field", wireType)
  893. }
  894. var v uint32
  895. for shift := uint(0); ; shift += 7 {
  896. if iNdEx >= l {
  897. return io.ErrUnexpectedEOF
  898. }
  899. b := data[iNdEx]
  900. iNdEx++
  901. v |= (uint32(b) & 0x7F) << shift
  902. if b < 0x80 {
  903. break
  904. }
  905. }
  906. // m.Field = &v
  907. theField = v
  908. hasFields[0] |= uint64(0x00000001)
  909. case 2:
  910. if wireType != 2 {
  911. return fmt.Errorf("proto: wrong wireType = %d for field Terms", wireType)
  912. }
  913. var stringLen uint64
  914. for shift := uint(0); ; shift += 7 {
  915. if iNdEx >= l {
  916. return io.ErrUnexpectedEOF
  917. }
  918. b := data[iNdEx]
  919. iNdEx++
  920. stringLen |= (uint64(b) & 0x7F) << shift
  921. if b < 0x80 {
  922. break
  923. }
  924. }
  925. postIndex := iNdEx + int(stringLen)
  926. if postIndex > l {
  927. return io.ErrUnexpectedEOF
  928. }
  929. //m.Terms = append(m.Terms, string(data[iNdEx:postIndex]))
  930. callback(theField, data[iNdEx:postIndex])
  931. iNdEx = postIndex
  932. default:
  933. var sizeOfWire int
  934. for {
  935. sizeOfWire++
  936. wire >>= 7
  937. if wire == 0 {
  938. break
  939. }
  940. }
  941. iNdEx -= sizeOfWire
  942. skippy, err := skipUpsidedown(data[iNdEx:])
  943. if err != nil {
  944. return err
  945. }
  946. if skippy < 0 {
  947. return ErrInvalidLengthUpsidedown
  948. }
  949. if (iNdEx + skippy) > l {
  950. return io.ErrUnexpectedEOF
  951. }
  952. //m.XXX_unrecognized = append(m.XXX_unrecognized, data[iNdEx:iNdEx+skippy]...)
  953. iNdEx += skippy
  954. }
  955. }
  956. // if hasFields[0]&uint64(0x00000001) == 0 {
  957. // return new(github_com_golang_protobuf_proto.RequiredNotSetError)
  958. // }
  959. return nil
  960. }