You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

character.go 1.8KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576
  1. // Copyright (c) 2016 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package character
  15. import (
  16. "unicode/utf8"
  17. "github.com/blevesearch/bleve/analysis"
  18. )
  19. type IsTokenRune func(r rune) bool
  20. type CharacterTokenizer struct {
  21. isTokenRun IsTokenRune
  22. }
  23. func NewCharacterTokenizer(f IsTokenRune) *CharacterTokenizer {
  24. return &CharacterTokenizer{
  25. isTokenRun: f,
  26. }
  27. }
  28. func (c *CharacterTokenizer) Tokenize(input []byte) analysis.TokenStream {
  29. rv := make(analysis.TokenStream, 0, 1024)
  30. offset := 0
  31. start := 0
  32. end := 0
  33. count := 0
  34. for currRune, size := utf8.DecodeRune(input[offset:]); currRune != utf8.RuneError; currRune, size = utf8.DecodeRune(input[offset:]) {
  35. isToken := c.isTokenRun(currRune)
  36. if isToken {
  37. end = offset + size
  38. } else {
  39. if end-start > 0 {
  40. // build token
  41. rv = append(rv, &analysis.Token{
  42. Term: input[start:end],
  43. Start: start,
  44. End: end,
  45. Position: count + 1,
  46. Type: analysis.AlphaNumeric,
  47. })
  48. count++
  49. }
  50. start = offset + size
  51. end = start
  52. }
  53. offset += size
  54. }
  55. // if we ended in the middle of a token, finish it
  56. if end-start > 0 {
  57. // build token
  58. rv = append(rv, &analysis.Token{
  59. Term: input[start:end],
  60. Start: start,
  61. End: end,
  62. Position: count + 1,
  63. Type: analysis.AlphaNumeric,
  64. })
  65. }
  66. return rv
  67. }