summaryrefslogtreecommitdiffstats
path: root/vendor/github.com/couchbase/vellum/utf8/utf8.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/couchbase/vellum/utf8/utf8.go')
-rw-r--r--vendor/github.com/couchbase/vellum/utf8/utf8.go246
1 files changed, 246 insertions, 0 deletions
diff --git a/vendor/github.com/couchbase/vellum/utf8/utf8.go b/vendor/github.com/couchbase/vellum/utf8/utf8.go
new file mode 100644
index 0000000000..47dbe9d1c5
--- /dev/null
+++ b/vendor/github.com/couchbase/vellum/utf8/utf8.go
@@ -0,0 +1,246 @@
+// Copyright (c) 2017 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package utf8
+
+import (
+ "fmt"
+ "unicode/utf8"
+)
+
+// Sequences is a collection of Sequence
+type Sequences []Sequence
+
+// NewSequences constructs a collection of Sequence which describe the
+// byte ranges covered between the start and end runes.
+func NewSequences(start, end rune) (Sequences, error) {
+ var rv Sequences
+
+ var rangeStack rangeStack
+ rangeStack = rangeStack.Push(&scalarRange{start, end})
+
+ rangeStack, r := rangeStack.Pop()
+TOP:
+ for r != nil {
+ INNER:
+ for {
+ r1, r2 := r.split()
+ if r1 != nil {
+ rangeStack = rangeStack.Push(&scalarRange{r2.start, r2.end})
+ r.start = r1.start
+ r.end = r1.end
+ continue INNER
+ }
+ if !r.valid() {
+ rangeStack, r = rangeStack.Pop()
+ continue TOP
+ }
+ for i := 1; i < utf8.UTFMax; i++ {
+ max := maxScalarValue(i)
+ if r.start <= max && max < r.end {
+ rangeStack = rangeStack.Push(&scalarRange{max + 1, r.end})
+ r.end = max
+ continue INNER
+ }
+ }
+ asciiRange := r.ascii()
+ if asciiRange != nil {
+ rv = append(rv, Sequence{
+ asciiRange,
+ })
+ rangeStack, r = rangeStack.Pop()
+ continue TOP
+ }
+ for i := uint(1); i < utf8.UTFMax; i++ {
+ m := rune((1 << (6 * i)) - 1)
+ if (r.start & ^m) != (r.end & ^m) {
+ if (r.start & m) != 0 {
+ rangeStack = rangeStack.Push(&scalarRange{(r.start | m) + 1, r.end})
+ r.end = r.start | m
+ continue INNER
+ }
+ if (r.end & m) != m {
+ rangeStack = rangeStack.Push(&scalarRange{r.end & ^m, r.end})
+ r.end = (r.end & ^m) - 1
+ continue INNER
+ }
+ }
+ }
+ start := make([]byte, utf8.UTFMax)
+ end := make([]byte, utf8.UTFMax)
+ n, m := r.encode(start, end)
+ seq, err := SequenceFromEncodedRange(start[0:n], end[0:m])
+ if err != nil {
+ return nil, err
+ }
+ rv = append(rv, seq)
+ rangeStack, r = rangeStack.Pop()
+ continue TOP
+ }
+ }
+
+ return rv, nil
+}
+
+// Sequence is a collection of *Range
+type Sequence []*Range
+
+// SequenceFromEncodedRange creates sequence from the encoded bytes
+func SequenceFromEncodedRange(start, end []byte) (Sequence, error) {
+ if len(start) != len(end) {
+ return nil, fmt.Errorf("byte slices must be the same length")
+ }
+ switch len(start) {
+ case 2:
+ return Sequence{
+ &Range{start[0], end[0]},
+ &Range{start[1], end[1]},
+ }, nil
+ case 3:
+ return Sequence{
+ &Range{start[0], end[0]},
+ &Range{start[1], end[1]},
+ &Range{start[2], end[2]},
+ }, nil
+ case 4:
+ return Sequence{
+ &Range{start[0], end[0]},
+ &Range{start[1], end[1]},
+ &Range{start[2], end[2]},
+ &Range{start[3], end[3]},
+ }, nil
+ }
+
+ return nil, fmt.Errorf("invalid encoded byte length")
+}
+
+// Matches checks to see if the provided byte slice matches the Sequence
+func (u Sequence) Matches(bytes []byte) bool {
+ if len(bytes) < len(u) {
+ return false
+ }
+ for i := 0; i < len(u); i++ {
+ if !u[i].matches(bytes[i]) {
+ return false
+ }
+ }
+ return true
+}
+
+func (u Sequence) String() string {
+ switch len(u) {
+ case 1:
+ return fmt.Sprintf("%v", u[0])
+ case 2:
+ return fmt.Sprintf("%v%v", u[0], u[1])
+ case 3:
+ return fmt.Sprintf("%v%v%v", u[0], u[1], u[2])
+ case 4:
+ return fmt.Sprintf("%v%v%v%v", u[0], u[1], u[2], u[3])
+ default:
+ return fmt.Sprintf("invalid utf8 sequence")
+ }
+}
+
+// Range describes a single range of byte values
+type Range struct {
+ Start byte
+ End byte
+}
+
+func (u Range) matches(b byte) bool {
+ if u.Start <= b && b <= u.End {
+ return true
+ }
+ return false
+}
+
+func (u Range) String() string {
+ if u.Start == u.End {
+ return fmt.Sprintf("[%X]", u.Start)
+ }
+ return fmt.Sprintf("[%X-%X]", u.Start, u.End)
+}
+
+type scalarRange struct {
+ start rune
+ end rune
+}
+
+func (s *scalarRange) String() string {
+ return fmt.Sprintf("ScalarRange(%d,%d)", s.start, s.end)
+}
+
+// split this scalar range if it overlaps with a surrogate codepoint
+func (s *scalarRange) split() (*scalarRange, *scalarRange) {
+ if s.start < 0xe000 && s.end > 0xd7ff {
+ return &scalarRange{
+ start: s.start,
+ end: 0xd7ff,
+ },
+ &scalarRange{
+ start: 0xe000,
+ end: s.end,
+ }
+ }
+ return nil, nil
+}
+
+func (s *scalarRange) valid() bool {
+ return s.start <= s.end
+}
+
+func (s *scalarRange) ascii() *Range {
+ if s.valid() && s.end <= 0x7f {
+ return &Range{
+ Start: byte(s.start),
+ End: byte(s.end),
+ }
+ }
+ return nil
+}
+
+// start and end MUST have capacity for utf8.UTFMax bytes
+func (s *scalarRange) encode(start, end []byte) (int, int) {
+ n := utf8.EncodeRune(start, s.start)
+ m := utf8.EncodeRune(end, s.end)
+ return n, m
+}
+
+type rangeStack []*scalarRange
+
+func (s rangeStack) Push(v *scalarRange) rangeStack {
+ return append(s, v)
+}
+
+func (s rangeStack) Pop() (rangeStack, *scalarRange) {
+ l := len(s)
+ if l < 1 {
+ return s, nil
+ }
+ return s[:l-1], s[l-1]
+}
+
+func maxScalarValue(nbytes int) rune {
+ switch nbytes {
+ case 1:
+ return 0x007f
+ case 2:
+ return 0x07FF
+ case 3:
+ return 0xFFFF
+ default:
+ return 0x10FFFF
+ }
+}