You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

utf8.go 6.0KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268
  1. // Copyright (c) 2017 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package utf8
  15. import (
  16. "fmt"
  17. "unicode/utf8"
  18. )
// Sequences is a collection of Sequence values, as returned by NewSequences
// and NewSequencesPrealloc.
type Sequences []Sequence
  21. // NewSequences constructs a collection of Sequence which describe the
  22. // byte ranges covered between the start and end runes.
  23. func NewSequences(start, end rune) (Sequences, error) {
  24. rv, _, err := NewSequencesPrealloc(start, end, nil, nil, nil, nil)
  25. return rv, err
  26. }
  27. func NewSequencesPrealloc(start, end rune,
  28. preallocSequences Sequences,
  29. preallocRangeStack RangeStack,
  30. preallocStartBytes, preallocEndBytes []byte) (Sequences, RangeStack, error) {
  31. rv := preallocSequences[:0]
  32. startBytes := preallocStartBytes
  33. if cap(startBytes) < utf8.UTFMax {
  34. startBytes = make([]byte, utf8.UTFMax)
  35. }
  36. startBytes = startBytes[:utf8.UTFMax]
  37. endBytes := preallocEndBytes
  38. if cap(endBytes) < utf8.UTFMax {
  39. endBytes = make([]byte, utf8.UTFMax)
  40. }
  41. endBytes = endBytes[:utf8.UTFMax]
  42. rangeStack := preallocRangeStack[:0]
  43. rangeStack = rangeStack.Push(scalarRange{start, end})
  44. rangeStack, r := rangeStack.Pop()
  45. TOP:
  46. for r != nilScalarRange {
  47. INNER:
  48. for {
  49. r1, r2 := r.split()
  50. if r1 != nilScalarRange {
  51. rangeStack = rangeStack.Push(scalarRange{r2.start, r2.end})
  52. r.start = r1.start
  53. r.end = r1.end
  54. continue INNER
  55. }
  56. if !r.valid() {
  57. rangeStack, r = rangeStack.Pop()
  58. continue TOP
  59. }
  60. for i := 1; i < utf8.UTFMax; i++ {
  61. max := maxScalarValue(i)
  62. if r.start <= max && max < r.end {
  63. rangeStack = rangeStack.Push(scalarRange{max + 1, r.end})
  64. r.end = max
  65. continue INNER
  66. }
  67. }
  68. asciiRange := r.ascii()
  69. if asciiRange != nilRange {
  70. rv = append(rv, Sequence{
  71. asciiRange,
  72. })
  73. rangeStack, r = rangeStack.Pop()
  74. continue TOP
  75. }
  76. for i := uint(1); i < utf8.UTFMax; i++ {
  77. m := rune((1 << (6 * i)) - 1)
  78. if (r.start & ^m) != (r.end & ^m) {
  79. if (r.start & m) != 0 {
  80. rangeStack = rangeStack.Push(scalarRange{(r.start | m) + 1, r.end})
  81. r.end = r.start | m
  82. continue INNER
  83. }
  84. if (r.end & m) != m {
  85. rangeStack = rangeStack.Push(scalarRange{r.end & ^m, r.end})
  86. r.end = (r.end & ^m) - 1
  87. continue INNER
  88. }
  89. }
  90. }
  91. n, m := r.encode(startBytes, endBytes)
  92. seq, err := SequenceFromEncodedRange(startBytes[0:n], endBytes[0:m])
  93. if err != nil {
  94. return nil, nil, err
  95. }
  96. rv = append(rv, seq)
  97. rangeStack, r = rangeStack.Pop()
  98. continue TOP
  99. }
  100. }
  101. return rv, rangeStack, nil
  102. }
// Sequence is a collection of Range, one per byte position: element i
// describes the byte values accepted at offset i (see Matches).
type Sequence []Range
  105. // SequenceFromEncodedRange creates sequence from the encoded bytes
  106. func SequenceFromEncodedRange(start, end []byte) (Sequence, error) {
  107. if len(start) != len(end) {
  108. return nil, fmt.Errorf("byte slices must be the same length")
  109. }
  110. switch len(start) {
  111. case 2:
  112. return Sequence{
  113. Range{start[0], end[0]},
  114. Range{start[1], end[1]},
  115. }, nil
  116. case 3:
  117. return Sequence{
  118. Range{start[0], end[0]},
  119. Range{start[1], end[1]},
  120. Range{start[2], end[2]},
  121. }, nil
  122. case 4:
  123. return Sequence{
  124. Range{start[0], end[0]},
  125. Range{start[1], end[1]},
  126. Range{start[2], end[2]},
  127. Range{start[3], end[3]},
  128. }, nil
  129. }
  130. return nil, fmt.Errorf("invalid encoded byte length")
  131. }
  132. // Matches checks to see if the provided byte slice matches the Sequence
  133. func (u Sequence) Matches(bytes []byte) bool {
  134. if len(bytes) < len(u) {
  135. return false
  136. }
  137. for i := 0; i < len(u); i++ {
  138. if !u[i].matches(bytes[i]) {
  139. return false
  140. }
  141. }
  142. return true
  143. }
  144. func (u Sequence) String() string {
  145. switch len(u) {
  146. case 1:
  147. return fmt.Sprintf("%v", u[0])
  148. case 2:
  149. return fmt.Sprintf("%v%v", u[0], u[1])
  150. case 3:
  151. return fmt.Sprintf("%v%v%v", u[0], u[1], u[2])
  152. case 4:
  153. return fmt.Sprintf("%v%v%v%v", u[0], u[1], u[2], u[3])
  154. default:
  155. return fmt.Sprintf("invalid utf8 sequence")
  156. }
  157. }
  158. // Range describes a single range of byte values
  159. type Range struct {
  160. Start byte
  161. End byte
  162. }
  163. var nilRange = Range{0xff, 0}
  164. func (u Range) matches(b byte) bool {
  165. if u.Start <= b && b <= u.End {
  166. return true
  167. }
  168. return false
  169. }
  170. func (u Range) String() string {
  171. if u.Start == u.End {
  172. return fmt.Sprintf("[%X]", u.Start)
  173. }
  174. return fmt.Sprintf("[%X-%X]", u.Start, u.End)
  175. }
  176. type scalarRange struct {
  177. start rune
  178. end rune
  179. }
  180. var nilScalarRange = scalarRange{0xffff, 0}
  181. func (s *scalarRange) String() string {
  182. return fmt.Sprintf("ScalarRange(%d,%d)", s.start, s.end)
  183. }
  184. // split this scalar range if it overlaps with a surrogate codepoint
  185. func (s *scalarRange) split() (scalarRange, scalarRange) {
  186. if s.start < 0xe000 && s.end > 0xd7ff {
  187. return scalarRange{
  188. start: s.start,
  189. end: 0xd7ff,
  190. },
  191. scalarRange{
  192. start: 0xe000,
  193. end: s.end,
  194. }
  195. }
  196. return nilScalarRange, nilScalarRange
  197. }
  198. func (s *scalarRange) valid() bool {
  199. return s.start <= s.end
  200. }
  201. func (s *scalarRange) ascii() Range {
  202. if s.valid() && s.end <= 0x7f {
  203. return Range{
  204. Start: byte(s.start),
  205. End: byte(s.end),
  206. }
  207. }
  208. return nilRange
  209. }
  210. // start and end MUST have capacity for utf8.UTFMax bytes
  211. func (s *scalarRange) encode(start, end []byte) (int, int) {
  212. n := utf8.EncodeRune(start, s.start)
  213. m := utf8.EncodeRune(end, s.end)
  214. return n, m
  215. }
// RangeStack is a last-in, first-out stack of scalar ranges, manipulated
// through Push and Pop. NewSequencesPrealloc uses it to hold the ranges it
// still has to process.
type RangeStack []scalarRange
  217. func (s RangeStack) Push(v scalarRange) RangeStack {
  218. return append(s, v)
  219. }
  220. func (s RangeStack) Pop() (RangeStack, scalarRange) {
  221. l := len(s)
  222. if l < 1 {
  223. return s, nilScalarRange
  224. }
  225. return s[:l-1], s[l-1]
  226. }
  227. func maxScalarValue(nbytes int) rune {
  228. switch nbytes {
  229. case 1:
  230. return 0x007f
  231. case 2:
  232. return 0x07FF
  233. case 3:
  234. return 0xFFFF
  235. default:
  236. return 0x10FFFF
  237. }
  238. }