123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854 |
- package syntax
-
- import (
- "bytes"
- "encoding/binary"
- "fmt"
- "sort"
- "unicode"
- "unicode/utf8"
- )
-
- // CharSet combines start-end rune ranges and unicode categories representing a set of characters
- type CharSet struct {
- ranges []singleRange
- categories []category
- sub *CharSet //optional subtractor
- negate bool
- anything bool
- }
-
- type category struct {
- negate bool
- cat string
- }
-
- type singleRange struct {
- first rune
- last rune
- }
-
- const (
- spaceCategoryText = " "
- wordCategoryText = "W"
- )
-
- var (
- ecmaSpace = []rune{0x0009, 0x000e, 0x0020, 0x0021, 0x00a0, 0x00a1, 0x1680, 0x1681, 0x2000, 0x200b, 0x2028, 0x202a, 0x202f, 0x2030, 0x205f, 0x2060, 0x3000, 0x3001, 0xfeff, 0xff00}
- ecmaWord = []rune{0x0030, 0x003a, 0x0041, 0x005b, 0x005f, 0x0060, 0x0061, 0x007b}
- ecmaDigit = []rune{0x0030, 0x003a}
- )
-
- var (
- AnyClass = getCharSetFromOldString([]rune{0}, false)
- ECMAAnyClass = getCharSetFromOldString([]rune{0, 0x000a, 0x000b, 0x000d, 0x000e}, false)
- NoneClass = getCharSetFromOldString(nil, false)
- ECMAWordClass = getCharSetFromOldString(ecmaWord, false)
- NotECMAWordClass = getCharSetFromOldString(ecmaWord, true)
- ECMASpaceClass = getCharSetFromOldString(ecmaSpace, false)
- NotECMASpaceClass = getCharSetFromOldString(ecmaSpace, true)
- ECMADigitClass = getCharSetFromOldString(ecmaDigit, false)
- NotECMADigitClass = getCharSetFromOldString(ecmaDigit, true)
-
- WordClass = getCharSetFromCategoryString(false, false, wordCategoryText)
- NotWordClass = getCharSetFromCategoryString(true, false, wordCategoryText)
- SpaceClass = getCharSetFromCategoryString(false, false, spaceCategoryText)
- NotSpaceClass = getCharSetFromCategoryString(true, false, spaceCategoryText)
- DigitClass = getCharSetFromCategoryString(false, false, "Nd")
- NotDigitClass = getCharSetFromCategoryString(false, true, "Nd")
- )
-
- var unicodeCategories = func() map[string]*unicode.RangeTable {
- retVal := make(map[string]*unicode.RangeTable)
- for k, v := range unicode.Scripts {
- retVal[k] = v
- }
- for k, v := range unicode.Categories {
- retVal[k] = v
- }
- for k, v := range unicode.Properties {
- retVal[k] = v
- }
- return retVal
- }()
-
- func getCharSetFromCategoryString(negateSet bool, negateCat bool, cats ...string) func() *CharSet {
- if negateCat && negateSet {
- panic("BUG! You should only negate the set OR the category in a constant setup, but not both")
- }
-
- c := CharSet{negate: negateSet}
-
- c.categories = make([]category, len(cats))
- for i, cat := range cats {
- c.categories[i] = category{cat: cat, negate: negateCat}
- }
- return func() *CharSet {
- //make a copy each time
- local := c
- //return that address
- return &local
- }
- }
-
- func getCharSetFromOldString(setText []rune, negate bool) func() *CharSet {
- c := CharSet{}
- if len(setText) > 0 {
- fillFirst := false
- l := len(setText)
- if negate {
- if setText[0] == 0 {
- setText = setText[1:]
- } else {
- l++
- fillFirst = true
- }
- }
-
- if l%2 == 0 {
- c.ranges = make([]singleRange, l/2)
- } else {
- c.ranges = make([]singleRange, l/2+1)
- }
-
- first := true
- if fillFirst {
- c.ranges[0] = singleRange{first: 0}
- first = false
- }
-
- i := 0
- for _, r := range setText {
- if first {
- // lower bound in a new range
- c.ranges[i] = singleRange{first: r}
- first = false
- } else {
- c.ranges[i].last = r - 1
- i++
- first = true
- }
- }
- if !first {
- c.ranges[i].last = utf8.MaxRune
- }
- }
-
- return func() *CharSet {
- local := c
- return &local
- }
- }
-
- // Copy makes a deep copy to prevent accidental mutation of a set
- func (c CharSet) Copy() CharSet {
- ret := CharSet{
- anything: c.anything,
- negate: c.negate,
- }
-
- ret.ranges = append(ret.ranges, c.ranges...)
- ret.categories = append(ret.categories, c.categories...)
-
- if c.sub != nil {
- sub := c.sub.Copy()
- ret.sub = &sub
- }
-
- return ret
- }
-
- // gets a human-readable description for a set string
- func (c CharSet) String() string {
- buf := &bytes.Buffer{}
- buf.WriteRune('[')
-
- if c.IsNegated() {
- buf.WriteRune('^')
- }
-
- for _, r := range c.ranges {
-
- buf.WriteString(CharDescription(r.first))
- if r.first != r.last {
- if r.last-r.first != 1 {
- //groups that are 1 char apart skip the dash
- buf.WriteRune('-')
- }
- buf.WriteString(CharDescription(r.last))
- }
- }
-
- for _, c := range c.categories {
- buf.WriteString(c.String())
- }
-
- if c.sub != nil {
- buf.WriteRune('-')
- buf.WriteString(c.sub.String())
- }
-
- buf.WriteRune(']')
-
- return buf.String()
- }
-
- // mapHashFill converts a charset into a buffer for use in maps
- func (c CharSet) mapHashFill(buf *bytes.Buffer) {
- if c.negate {
- buf.WriteByte(0)
- } else {
- buf.WriteByte(1)
- }
-
- binary.Write(buf, binary.LittleEndian, len(c.ranges))
- binary.Write(buf, binary.LittleEndian, len(c.categories))
- for _, r := range c.ranges {
- buf.WriteRune(r.first)
- buf.WriteRune(r.last)
- }
- for _, ct := range c.categories {
- buf.WriteString(ct.cat)
- if ct.negate {
- buf.WriteByte(1)
- } else {
- buf.WriteByte(0)
- }
- }
-
- if c.sub != nil {
- c.sub.mapHashFill(buf)
- }
- }
-
- // CharIn returns true if the rune is in our character set (either ranges or categories).
- // It handles negations and subtracted sub-charsets.
- func (c CharSet) CharIn(ch rune) bool {
- val := false
- // in s && !s.subtracted
-
- //check ranges
- for _, r := range c.ranges {
- if ch < r.first {
- continue
- }
- if ch <= r.last {
- val = true
- break
- }
- }
-
- //check categories if we haven't already found a range
- if !val && len(c.categories) > 0 {
- for _, ct := range c.categories {
- // special categories...then unicode
- if ct.cat == spaceCategoryText {
- if unicode.IsSpace(ch) {
- // we found a space so we're done
- // negate means this is a "bad" thing
- val = !ct.negate
- break
- } else if ct.negate {
- val = true
- break
- }
- } else if ct.cat == wordCategoryText {
- if IsWordChar(ch) {
- val = !ct.negate
- break
- } else if ct.negate {
- val = true
- break
- }
- } else if unicode.Is(unicodeCategories[ct.cat], ch) {
- // if we're in this unicode category then we're done
- // if negate=true on this category then we "failed" our test
- // otherwise we're good that we found it
- val = !ct.negate
- break
- } else if ct.negate {
- val = true
- break
- }
- }
- }
-
- // negate the whole char set
- if c.negate {
- val = !val
- }
-
- // get subtracted recurse
- if val && c.sub != nil {
- val = !c.sub.CharIn(ch)
- }
-
- //log.Printf("Char '%v' in %v == %v", string(ch), c.String(), val)
- return val
- }
-
- func (c category) String() string {
- switch c.cat {
- case spaceCategoryText:
- if c.negate {
- return "\\S"
- }
- return "\\s"
- case wordCategoryText:
- if c.negate {
- return "\\W"
- }
- return "\\w"
- }
- if _, ok := unicodeCategories[c.cat]; ok {
-
- if c.negate {
- return "\\P{" + c.cat + "}"
- }
- return "\\p{" + c.cat + "}"
- }
- return "Unknown category: " + c.cat
- }
-
- // CharDescription Produces a human-readable description for a single character.
- func CharDescription(ch rune) string {
- /*if ch == '\\' {
- return "\\\\"
- }
-
- if ch > ' ' && ch <= '~' {
- return string(ch)
- } else if ch == '\n' {
- return "\\n"
- } else if ch == ' ' {
- return "\\ "
- }*/
-
- b := &bytes.Buffer{}
- escape(b, ch, false) //fmt.Sprintf("%U", ch)
- return b.String()
- }
-
- // According to UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/)
- // RL 1.4 Simple Word Boundaries The class of <word_character> includes all Alphabetic
- // values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C
- // ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.
- func IsWordChar(r rune) bool {
- //"L", "Mn", "Nd", "Pc"
- return unicode.In(r,
- unicode.Categories["L"], unicode.Categories["Mn"],
- unicode.Categories["Nd"], unicode.Categories["Pc"]) || r == '\u200D' || r == '\u200C'
- //return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
- }
-
- func IsECMAWordChar(r rune) bool {
- return unicode.In(r,
- unicode.Categories["L"], unicode.Categories["Mn"],
- unicode.Categories["Nd"], unicode.Categories["Pc"])
-
- //return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
- }
-
- // SingletonChar will return the char from the first range without validation.
- // It assumes you have checked for IsSingleton or IsSingletonInverse and will panic given bad input
- func (c CharSet) SingletonChar() rune {
- return c.ranges[0].first
- }
-
- func (c CharSet) IsSingleton() bool {
- return !c.negate && //negated is multiple chars
- len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars
- c.sub == nil && // subtraction means we've got multiple chars
- c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char
- }
-
- func (c CharSet) IsSingletonInverse() bool {
- return c.negate && //same as above, but requires negated
- len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars
- c.sub == nil && // subtraction means we've got multiple chars
- c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char
- }
-
- func (c CharSet) IsMergeable() bool {
- return !c.IsNegated() && !c.HasSubtraction()
- }
-
- func (c CharSet) IsNegated() bool {
- return c.negate
- }
-
- func (c CharSet) HasSubtraction() bool {
- return c.sub != nil
- }
-
- func (c CharSet) IsEmpty() bool {
- return len(c.ranges) == 0 && len(c.categories) == 0 && c.sub == nil
- }
-
- func (c *CharSet) addDigit(ecma, negate bool, pattern string) {
- if ecma {
- if negate {
- c.addRanges(NotECMADigitClass().ranges)
- } else {
- c.addRanges(ECMADigitClass().ranges)
- }
- } else {
- c.addCategories(category{cat: "Nd", negate: negate})
- }
- }
-
- func (c *CharSet) addChar(ch rune) {
- c.addRange(ch, ch)
- }
-
- func (c *CharSet) addSpace(ecma, negate bool) {
- if ecma {
- if negate {
- c.addRanges(NotECMASpaceClass().ranges)
- } else {
- c.addRanges(ECMASpaceClass().ranges)
- }
- } else {
- c.addCategories(category{cat: spaceCategoryText, negate: negate})
- }
- }
-
- func (c *CharSet) addWord(ecma, negate bool) {
- if ecma {
- if negate {
- c.addRanges(NotECMAWordClass().ranges)
- } else {
- c.addRanges(ECMAWordClass().ranges)
- }
- } else {
- c.addCategories(category{cat: wordCategoryText, negate: negate})
- }
- }
-
- // Add set ranges and categories into ours -- no deduping or anything
- func (c *CharSet) addSet(set CharSet) {
- if c.anything {
- return
- }
- if set.anything {
- c.makeAnything()
- return
- }
- // just append here to prevent double-canon
- c.ranges = append(c.ranges, set.ranges...)
- c.addCategories(set.categories...)
- c.canonicalize()
- }
-
- func (c *CharSet) makeAnything() {
- c.anything = true
- c.categories = []category{}
- c.ranges = AnyClass().ranges
- }
-
- func (c *CharSet) addCategories(cats ...category) {
- // don't add dupes and remove positive+negative
- if c.anything {
- // if we've had a previous positive+negative group then
- // just return, we're as broad as we can get
- return
- }
-
- for _, ct := range cats {
- found := false
- for _, ct2 := range c.categories {
- if ct.cat == ct2.cat {
- if ct.negate != ct2.negate {
- // oposite negations...this mean we just
- // take us as anything and move on
- c.makeAnything()
- return
- }
- found = true
- break
- }
- }
-
- if !found {
- c.categories = append(c.categories, ct)
- }
- }
- }
-
- // Merges new ranges to our own
- func (c *CharSet) addRanges(ranges []singleRange) {
- if c.anything {
- return
- }
- c.ranges = append(c.ranges, ranges...)
- c.canonicalize()
- }
-
- // Merges everything but the new ranges into our own
- func (c *CharSet) addNegativeRanges(ranges []singleRange) {
- if c.anything {
- return
- }
-
- var hi rune
-
- // convert incoming ranges into opposites, assume they are in order
- for _, r := range ranges {
- if hi < r.first {
- c.ranges = append(c.ranges, singleRange{hi, r.first - 1})
- }
- hi = r.last + 1
- }
-
- if hi < utf8.MaxRune {
- c.ranges = append(c.ranges, singleRange{hi, utf8.MaxRune})
- }
-
- c.canonicalize()
- }
-
- func isValidUnicodeCat(catName string) bool {
- _, ok := unicodeCategories[catName]
- return ok
- }
-
- func (c *CharSet) addCategory(categoryName string, negate, caseInsensitive bool, pattern string) {
- if !isValidUnicodeCat(categoryName) {
- // unknown unicode category, script, or property "blah"
- panic(fmt.Errorf("Unknown unicode category, script, or property '%v'", categoryName))
-
- }
-
- if caseInsensitive && (categoryName == "Ll" || categoryName == "Lu" || categoryName == "Lt") {
- // when RegexOptions.IgnoreCase is specified then {Ll} {Lu} and {Lt} cases should all match
- c.addCategories(
- category{cat: "Ll", negate: negate},
- category{cat: "Lu", negate: negate},
- category{cat: "Lt", negate: negate})
- }
- c.addCategories(category{cat: categoryName, negate: negate})
- }
-
- func (c *CharSet) addSubtraction(sub *CharSet) {
- c.sub = sub
- }
-
- func (c *CharSet) addRange(chMin, chMax rune) {
- c.ranges = append(c.ranges, singleRange{first: chMin, last: chMax})
- c.canonicalize()
- }
-
- func (c *CharSet) addNamedASCII(name string, negate bool) bool {
- var rs []singleRange
-
- switch name {
- case "alnum":
- rs = []singleRange{singleRange{'0', '9'}, singleRange{'A', 'Z'}, singleRange{'a', 'z'}}
- case "alpha":
- rs = []singleRange{singleRange{'A', 'Z'}, singleRange{'a', 'z'}}
- case "ascii":
- rs = []singleRange{singleRange{0, 0x7f}}
- case "blank":
- rs = []singleRange{singleRange{'\t', '\t'}, singleRange{' ', ' '}}
- case "cntrl":
- rs = []singleRange{singleRange{0, 0x1f}, singleRange{0x7f, 0x7f}}
- case "digit":
- c.addDigit(false, negate, "")
- case "graph":
- rs = []singleRange{singleRange{'!', '~'}}
- case "lower":
- rs = []singleRange{singleRange{'a', 'z'}}
- case "print":
- rs = []singleRange{singleRange{' ', '~'}}
- case "punct": //[!-/:-@[-`{-~]
- rs = []singleRange{singleRange{'!', '/'}, singleRange{':', '@'}, singleRange{'[', '`'}, singleRange{'{', '~'}}
- case "space":
- c.addSpace(true, negate)
- case "upper":
- rs = []singleRange{singleRange{'A', 'Z'}}
- case "word":
- c.addWord(true, negate)
- case "xdigit":
- rs = []singleRange{singleRange{'0', '9'}, singleRange{'A', 'F'}, singleRange{'a', 'f'}}
- default:
- return false
- }
-
- if len(rs) > 0 {
- if negate {
- c.addNegativeRanges(rs)
- } else {
- c.addRanges(rs)
- }
- }
-
- return true
- }
-
- type singleRangeSorter []singleRange
-
- func (p singleRangeSorter) Len() int { return len(p) }
- func (p singleRangeSorter) Less(i, j int) bool { return p[i].first < p[j].first }
- func (p singleRangeSorter) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
-
- // Logic to reduce a character class to a unique, sorted form.
- func (c *CharSet) canonicalize() {
- var i, j int
- var last rune
-
- //
- // Find and eliminate overlapping or abutting ranges
- //
-
- if len(c.ranges) > 1 {
- sort.Sort(singleRangeSorter(c.ranges))
-
- done := false
-
- for i, j = 1, 0; ; i++ {
- for last = c.ranges[j].last; ; i++ {
- if i == len(c.ranges) || last == utf8.MaxRune {
- done = true
- break
- }
-
- CurrentRange := c.ranges[i]
- if CurrentRange.first > last+1 {
- break
- }
-
- if last < CurrentRange.last {
- last = CurrentRange.last
- }
- }
-
- c.ranges[j] = singleRange{first: c.ranges[j].first, last: last}
-
- j++
-
- if done {
- break
- }
-
- if j < i {
- c.ranges[j] = c.ranges[i]
- }
- }
-
- c.ranges = append(c.ranges[:j], c.ranges[len(c.ranges):]...)
- }
- }
-
- // Adds to the class any lowercase versions of characters already
- // in the class. Used for case-insensitivity.
- func (c *CharSet) addLowercase() {
- if c.anything {
- return
- }
- toAdd := []singleRange{}
- for i := 0; i < len(c.ranges); i++ {
- r := c.ranges[i]
- if r.first == r.last {
- lower := unicode.ToLower(r.first)
- c.ranges[i] = singleRange{first: lower, last: lower}
- } else {
- toAdd = append(toAdd, r)
- }
- }
-
- for _, r := range toAdd {
- c.addLowercaseRange(r.first, r.last)
- }
- c.canonicalize()
- }
-
- /**************************************************************************
- Let U be the set of Unicode character values and let L be the lowercase
- function, mapping from U to U. To perform case insensitive matching of
- character sets, we need to be able to map an interval I in U, say
-
- I = [chMin, chMax] = { ch : chMin <= ch <= chMax }
-
- to a set A such that A contains L(I) and A is contained in the union of
- I and L(I).
-
- The table below partitions U into intervals on which L is non-decreasing.
- Thus, for any interval J = [a, b] contained in one of these intervals,
- L(J) is contained in [L(a), L(b)].
-
- It is also true that for any such J, [L(a), L(b)] is contained in the
- union of J and L(J). This does not follow from L being non-decreasing on
- these intervals. It follows from the nature of the L on each interval.
- On each interval, L has one of the following forms:
-
- (1) L(ch) = constant (LowercaseSet)
- (2) L(ch) = ch + offset (LowercaseAdd)
- (3) L(ch) = ch | 1 (LowercaseBor)
- (4) L(ch) = ch + (ch & 1) (LowercaseBad)
-
- It is easy to verify that for any of these forms [L(a), L(b)] is
- contained in the union of [a, b] and L([a, b]).
- ***************************************************************************/
-
- const (
- LowercaseSet = 0 // Set to arg.
- LowercaseAdd = 1 // Add arg.
- LowercaseBor = 2 // Bitwise or with 1.
- LowercaseBad = 3 // Bitwise and with 1 and add original.
- )
-
- type lcMap struct {
- chMin, chMax rune
- op, data int32
- }
-
- var lcTable = []lcMap{
- lcMap{'\u0041', '\u005A', LowercaseAdd, 32},
- lcMap{'\u00C0', '\u00DE', LowercaseAdd, 32},
- lcMap{'\u0100', '\u012E', LowercaseBor, 0},
- lcMap{'\u0130', '\u0130', LowercaseSet, 0x0069},
- lcMap{'\u0132', '\u0136', LowercaseBor, 0},
- lcMap{'\u0139', '\u0147', LowercaseBad, 0},
- lcMap{'\u014A', '\u0176', LowercaseBor, 0},
- lcMap{'\u0178', '\u0178', LowercaseSet, 0x00FF},
- lcMap{'\u0179', '\u017D', LowercaseBad, 0},
- lcMap{'\u0181', '\u0181', LowercaseSet, 0x0253},
- lcMap{'\u0182', '\u0184', LowercaseBor, 0},
- lcMap{'\u0186', '\u0186', LowercaseSet, 0x0254},
- lcMap{'\u0187', '\u0187', LowercaseSet, 0x0188},
- lcMap{'\u0189', '\u018A', LowercaseAdd, 205},
- lcMap{'\u018B', '\u018B', LowercaseSet, 0x018C},
- lcMap{'\u018E', '\u018E', LowercaseSet, 0x01DD},
- lcMap{'\u018F', '\u018F', LowercaseSet, 0x0259},
- lcMap{'\u0190', '\u0190', LowercaseSet, 0x025B},
- lcMap{'\u0191', '\u0191', LowercaseSet, 0x0192},
- lcMap{'\u0193', '\u0193', LowercaseSet, 0x0260},
- lcMap{'\u0194', '\u0194', LowercaseSet, 0x0263},
- lcMap{'\u0196', '\u0196', LowercaseSet, 0x0269},
- lcMap{'\u0197', '\u0197', LowercaseSet, 0x0268},
- lcMap{'\u0198', '\u0198', LowercaseSet, 0x0199},
- lcMap{'\u019C', '\u019C', LowercaseSet, 0x026F},
- lcMap{'\u019D', '\u019D', LowercaseSet, 0x0272},
- lcMap{'\u019F', '\u019F', LowercaseSet, 0x0275},
- lcMap{'\u01A0', '\u01A4', LowercaseBor, 0},
- lcMap{'\u01A7', '\u01A7', LowercaseSet, 0x01A8},
- lcMap{'\u01A9', '\u01A9', LowercaseSet, 0x0283},
- lcMap{'\u01AC', '\u01AC', LowercaseSet, 0x01AD},
- lcMap{'\u01AE', '\u01AE', LowercaseSet, 0x0288},
- lcMap{'\u01AF', '\u01AF', LowercaseSet, 0x01B0},
- lcMap{'\u01B1', '\u01B2', LowercaseAdd, 217},
- lcMap{'\u01B3', '\u01B5', LowercaseBad, 0},
- lcMap{'\u01B7', '\u01B7', LowercaseSet, 0x0292},
- lcMap{'\u01B8', '\u01B8', LowercaseSet, 0x01B9},
- lcMap{'\u01BC', '\u01BC', LowercaseSet, 0x01BD},
- lcMap{'\u01C4', '\u01C5', LowercaseSet, 0x01C6},
- lcMap{'\u01C7', '\u01C8', LowercaseSet, 0x01C9},
- lcMap{'\u01CA', '\u01CB', LowercaseSet, 0x01CC},
- lcMap{'\u01CD', '\u01DB', LowercaseBad, 0},
- lcMap{'\u01DE', '\u01EE', LowercaseBor, 0},
- lcMap{'\u01F1', '\u01F2', LowercaseSet, 0x01F3},
- lcMap{'\u01F4', '\u01F4', LowercaseSet, 0x01F5},
- lcMap{'\u01FA', '\u0216', LowercaseBor, 0},
- lcMap{'\u0386', '\u0386', LowercaseSet, 0x03AC},
- lcMap{'\u0388', '\u038A', LowercaseAdd, 37},
- lcMap{'\u038C', '\u038C', LowercaseSet, 0x03CC},
- lcMap{'\u038E', '\u038F', LowercaseAdd, 63},
- lcMap{'\u0391', '\u03AB', LowercaseAdd, 32},
- lcMap{'\u03E2', '\u03EE', LowercaseBor, 0},
- lcMap{'\u0401', '\u040F', LowercaseAdd, 80},
- lcMap{'\u0410', '\u042F', LowercaseAdd, 32},
- lcMap{'\u0460', '\u0480', LowercaseBor, 0},
- lcMap{'\u0490', '\u04BE', LowercaseBor, 0},
- lcMap{'\u04C1', '\u04C3', LowercaseBad, 0},
- lcMap{'\u04C7', '\u04C7', LowercaseSet, 0x04C8},
- lcMap{'\u04CB', '\u04CB', LowercaseSet, 0x04CC},
- lcMap{'\u04D0', '\u04EA', LowercaseBor, 0},
- lcMap{'\u04EE', '\u04F4', LowercaseBor, 0},
- lcMap{'\u04F8', '\u04F8', LowercaseSet, 0x04F9},
- lcMap{'\u0531', '\u0556', LowercaseAdd, 48},
- lcMap{'\u10A0', '\u10C5', LowercaseAdd, 48},
- lcMap{'\u1E00', '\u1EF8', LowercaseBor, 0},
- lcMap{'\u1F08', '\u1F0F', LowercaseAdd, -8},
- lcMap{'\u1F18', '\u1F1F', LowercaseAdd, -8},
- lcMap{'\u1F28', '\u1F2F', LowercaseAdd, -8},
- lcMap{'\u1F38', '\u1F3F', LowercaseAdd, -8},
- lcMap{'\u1F48', '\u1F4D', LowercaseAdd, -8},
- lcMap{'\u1F59', '\u1F59', LowercaseSet, 0x1F51},
- lcMap{'\u1F5B', '\u1F5B', LowercaseSet, 0x1F53},
- lcMap{'\u1F5D', '\u1F5D', LowercaseSet, 0x1F55},
- lcMap{'\u1F5F', '\u1F5F', LowercaseSet, 0x1F57},
- lcMap{'\u1F68', '\u1F6F', LowercaseAdd, -8},
- lcMap{'\u1F88', '\u1F8F', LowercaseAdd, -8},
- lcMap{'\u1F98', '\u1F9F', LowercaseAdd, -8},
- lcMap{'\u1FA8', '\u1FAF', LowercaseAdd, -8},
- lcMap{'\u1FB8', '\u1FB9', LowercaseAdd, -8},
- lcMap{'\u1FBA', '\u1FBB', LowercaseAdd, -74},
- lcMap{'\u1FBC', '\u1FBC', LowercaseSet, 0x1FB3},
- lcMap{'\u1FC8', '\u1FCB', LowercaseAdd, -86},
- lcMap{'\u1FCC', '\u1FCC', LowercaseSet, 0x1FC3},
- lcMap{'\u1FD8', '\u1FD9', LowercaseAdd, -8},
- lcMap{'\u1FDA', '\u1FDB', LowercaseAdd, -100},
- lcMap{'\u1FE8', '\u1FE9', LowercaseAdd, -8},
- lcMap{'\u1FEA', '\u1FEB', LowercaseAdd, -112},
- lcMap{'\u1FEC', '\u1FEC', LowercaseSet, 0x1FE5},
- lcMap{'\u1FF8', '\u1FF9', LowercaseAdd, -128},
- lcMap{'\u1FFA', '\u1FFB', LowercaseAdd, -126},
- lcMap{'\u1FFC', '\u1FFC', LowercaseSet, 0x1FF3},
- lcMap{'\u2160', '\u216F', LowercaseAdd, 16},
- lcMap{'\u24B6', '\u24D0', LowercaseAdd, 26},
- lcMap{'\uFF21', '\uFF3A', LowercaseAdd, 32},
- }
-
- func (c *CharSet) addLowercaseRange(chMin, chMax rune) {
- var i, iMax, iMid int
- var chMinT, chMaxT rune
- var lc lcMap
-
- for i, iMax = 0, len(lcTable); i < iMax; {
- iMid = (i + iMax) / 2
- if lcTable[iMid].chMax < chMin {
- i = iMid + 1
- } else {
- iMax = iMid
- }
- }
-
- for ; i < len(lcTable); i++ {
- lc = lcTable[i]
- if lc.chMin > chMax {
- return
- }
- chMinT = lc.chMin
- if chMinT < chMin {
- chMinT = chMin
- }
-
- chMaxT = lc.chMax
- if chMaxT > chMax {
- chMaxT = chMax
- }
-
- switch lc.op {
- case LowercaseSet:
- chMinT = rune(lc.data)
- chMaxT = rune(lc.data)
- break
- case LowercaseAdd:
- chMinT += lc.data
- chMaxT += lc.data
- break
- case LowercaseBor:
- chMinT |= 1
- chMaxT |= 1
- break
- case LowercaseBad:
- chMinT += (chMinT & 1)
- chMaxT += (chMaxT & 1)
- break
- }
-
- if chMinT < chMin || chMaxT > chMax {
- c.addRange(chMinT, chMaxT)
- }
- }
- }
|