You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

charclass.go 22KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854
  1. package syntax
  2. import (
  3. "bytes"
  4. "encoding/binary"
  5. "fmt"
  6. "sort"
  7. "unicode"
  8. "unicode/utf8"
  9. )
  10. // CharSet combines start-end rune ranges and unicode categories representing a set of characters
  11. type CharSet struct {
  12. ranges []singleRange
  13. categories []category
  14. sub *CharSet //optional subtractor
  15. negate bool
  16. anything bool
  17. }
  // category names a single character category together with a flag saying
  // whether membership in that category is inverted.
  type category struct {
  // negate is true when the category is inverted (String renders it as \S, \W or \P{...}).
  negate bool
  // cat is spaceCategoryText, wordCategoryText, or a key of unicodeCategories.
  cat string
  }
  // singleRange is one contiguous run of runes; both first and last are
  // inclusive bounds.
  type singleRange struct {
  	first, last rune
  }
  // spaceCategoryText is the category name CharIn tests with unicode.IsSpace.
  const spaceCategoryText = " "

  // wordCategoryText is the category name CharIn tests with IsWordChar.
  const wordCategoryText = "W"
  // Boundary lists in the format consumed by getCharSetFromOldString: the runes
  // alternate between the first rune of a range and the first rune after it.
  // Each line below is therefore one half-open [start, end) pair.
  var (
  	ecmaSpace = []rune{
  		0x0009, 0x000e,
  		0x0020, 0x0021,
  		0x00a0, 0x00a1,
  		0x1680, 0x1681,
  		0x2000, 0x200b,
  		0x2028, 0x202a,
  		0x202f, 0x2030,
  		0x205f, 0x2060,
  		0x3000, 0x3001,
  		0xfeff, 0xff00,
  	}
  	ecmaWord = []rune{
  		0x0030, 0x003a, // '0'-'9'
  		0x0041, 0x005b, // 'A'-'Z'
  		0x005f, 0x0060, // '_'
  		0x0061, 0x007b, // 'a'-'z'
  	}
  	ecmaDigit = []rune{
  		0x0030, 0x003a, // '0'-'9'
  	}
  )
  35. var (
  36. AnyClass = getCharSetFromOldString([]rune{0}, false)
  37. ECMAAnyClass = getCharSetFromOldString([]rune{0, 0x000a, 0x000b, 0x000d, 0x000e}, false)
  38. NoneClass = getCharSetFromOldString(nil, false)
  39. ECMAWordClass = getCharSetFromOldString(ecmaWord, false)
  40. NotECMAWordClass = getCharSetFromOldString(ecmaWord, true)
  41. ECMASpaceClass = getCharSetFromOldString(ecmaSpace, false)
  42. NotECMASpaceClass = getCharSetFromOldString(ecmaSpace, true)
  43. ECMADigitClass = getCharSetFromOldString(ecmaDigit, false)
  44. NotECMADigitClass = getCharSetFromOldString(ecmaDigit, true)
  45. WordClass = getCharSetFromCategoryString(false, false, wordCategoryText)
  46. NotWordClass = getCharSetFromCategoryString(true, false, wordCategoryText)
  47. SpaceClass = getCharSetFromCategoryString(false, false, spaceCategoryText)
  48. NotSpaceClass = getCharSetFromCategoryString(true, false, spaceCategoryText)
  49. DigitClass = getCharSetFromCategoryString(false, false, "Nd")
  50. NotDigitClass = getCharSetFromCategoryString(false, true, "Nd")
  51. )
  // unicodeCategories is a single lookup table over the names in
  // unicode.Scripts, unicode.Categories and unicode.Properties. The sources
  // are copied in that order, so on a name clash the later source wins.
  var unicodeCategories = func() map[string]*unicode.RangeTable {
  	sources := []map[string]*unicode.RangeTable{
  		unicode.Scripts,
  		unicode.Categories,
  		unicode.Properties,
  	}
  	merged := make(map[string]*unicode.RangeTable)
  	for _, src := range sources {
  		for name, table := range src {
  			merged[name] = table
  		}
  	}
  	return merged
  }()
  65. func getCharSetFromCategoryString(negateSet bool, negateCat bool, cats ...string) func() *CharSet {
  66. if negateCat && negateSet {
  67. panic("BUG! You should only negate the set OR the category in a constant setup, but not both")
  68. }
  69. c := CharSet{negate: negateSet}
  70. c.categories = make([]category, len(cats))
  71. for i, cat := range cats {
  72. c.categories[i] = category{cat: cat, negate: negateCat}
  73. }
  74. return func() *CharSet {
  75. //make a copy each time
  76. local := c
  77. //return that address
  78. return &local
  79. }
  80. }
  81. func getCharSetFromOldString(setText []rune, negate bool) func() *CharSet {
  82. c := CharSet{}
  83. if len(setText) > 0 {
  84. fillFirst := false
  85. l := len(setText)
  86. if negate {
  87. if setText[0] == 0 {
  88. setText = setText[1:]
  89. } else {
  90. l++
  91. fillFirst = true
  92. }
  93. }
  94. if l%2 == 0 {
  95. c.ranges = make([]singleRange, l/2)
  96. } else {
  97. c.ranges = make([]singleRange, l/2+1)
  98. }
  99. first := true
  100. if fillFirst {
  101. c.ranges[0] = singleRange{first: 0}
  102. first = false
  103. }
  104. i := 0
  105. for _, r := range setText {
  106. if first {
  107. // lower bound in a new range
  108. c.ranges[i] = singleRange{first: r}
  109. first = false
  110. } else {
  111. c.ranges[i].last = r - 1
  112. i++
  113. first = true
  114. }
  115. }
  116. if !first {
  117. c.ranges[i].last = utf8.MaxRune
  118. }
  119. }
  120. return func() *CharSet {
  121. local := c
  122. return &local
  123. }
  124. }
  125. // Copy makes a deep copy to prevent accidental mutation of a set
  126. func (c CharSet) Copy() CharSet {
  127. ret := CharSet{
  128. anything: c.anything,
  129. negate: c.negate,
  130. }
  131. ret.ranges = append(ret.ranges, c.ranges...)
  132. ret.categories = append(ret.categories, c.categories...)
  133. if c.sub != nil {
  134. sub := c.sub.Copy()
  135. ret.sub = &sub
  136. }
  137. return ret
  138. }
  139. // gets a human-readable description for a set string
  140. func (c CharSet) String() string {
  141. buf := &bytes.Buffer{}
  142. buf.WriteRune('[')
  143. if c.IsNegated() {
  144. buf.WriteRune('^')
  145. }
  146. for _, r := range c.ranges {
  147. buf.WriteString(CharDescription(r.first))
  148. if r.first != r.last {
  149. if r.last-r.first != 1 {
  150. //groups that are 1 char apart skip the dash
  151. buf.WriteRune('-')
  152. }
  153. buf.WriteString(CharDescription(r.last))
  154. }
  155. }
  156. for _, c := range c.categories {
  157. buf.WriteString(c.String())
  158. }
  159. if c.sub != nil {
  160. buf.WriteRune('-')
  161. buf.WriteString(c.sub.String())
  162. }
  163. buf.WriteRune(']')
  164. return buf.String()
  165. }
  166. // mapHashFill converts a charset into a buffer for use in maps
  167. func (c CharSet) mapHashFill(buf *bytes.Buffer) {
  168. if c.negate {
  169. buf.WriteByte(0)
  170. } else {
  171. buf.WriteByte(1)
  172. }
  173. binary.Write(buf, binary.LittleEndian, len(c.ranges))
  174. binary.Write(buf, binary.LittleEndian, len(c.categories))
  175. for _, r := range c.ranges {
  176. buf.WriteRune(r.first)
  177. buf.WriteRune(r.last)
  178. }
  179. for _, ct := range c.categories {
  180. buf.WriteString(ct.cat)
  181. if ct.negate {
  182. buf.WriteByte(1)
  183. } else {
  184. buf.WriteByte(0)
  185. }
  186. }
  187. if c.sub != nil {
  188. c.sub.mapHashFill(buf)
  189. }
  190. }
  191. // CharIn returns true if the rune is in our character set (either ranges or categories).
  192. // It handles negations and subtracted sub-charsets.
  193. func (c CharSet) CharIn(ch rune) bool {
  194. val := false
  195. // in s && !s.subtracted
  196. //check ranges
  197. for _, r := range c.ranges {
  198. if ch < r.first {
  199. continue
  200. }
  201. if ch <= r.last {
  202. val = true
  203. break
  204. }
  205. }
  206. //check categories if we haven't already found a range
  207. if !val && len(c.categories) > 0 {
  208. for _, ct := range c.categories {
  209. // special categories...then unicode
  210. if ct.cat == spaceCategoryText {
  211. if unicode.IsSpace(ch) {
  212. // we found a space so we're done
  213. // negate means this is a "bad" thing
  214. val = !ct.negate
  215. break
  216. } else if ct.negate {
  217. val = true
  218. break
  219. }
  220. } else if ct.cat == wordCategoryText {
  221. if IsWordChar(ch) {
  222. val = !ct.negate
  223. break
  224. } else if ct.negate {
  225. val = true
  226. break
  227. }
  228. } else if unicode.Is(unicodeCategories[ct.cat], ch) {
  229. // if we're in this unicode category then we're done
  230. // if negate=true on this category then we "failed" our test
  231. // otherwise we're good that we found it
  232. val = !ct.negate
  233. break
  234. } else if ct.negate {
  235. val = true
  236. break
  237. }
  238. }
  239. }
  240. // negate the whole char set
  241. if c.negate {
  242. val = !val
  243. }
  244. // get subtracted recurse
  245. if val && c.sub != nil {
  246. val = !c.sub.CharIn(ch)
  247. }
  248. //log.Printf("Char '%v' in %v == %v", string(ch), c.String(), val)
  249. return val
  250. }
  251. func (c category) String() string {
  252. switch c.cat {
  253. case spaceCategoryText:
  254. if c.negate {
  255. return "\\S"
  256. }
  257. return "\\s"
  258. case wordCategoryText:
  259. if c.negate {
  260. return "\\W"
  261. }
  262. return "\\w"
  263. }
  264. if _, ok := unicodeCategories[c.cat]; ok {
  265. if c.negate {
  266. return "\\P{" + c.cat + "}"
  267. }
  268. return "\\p{" + c.cat + "}"
  269. }
  270. return "Unknown category: " + c.cat
  271. }
  272. // CharDescription Produces a human-readable description for a single character.
  273. func CharDescription(ch rune) string {
  274. /*if ch == '\\' {
  275. return "\\\\"
  276. }
  277. if ch > ' ' && ch <= '~' {
  278. return string(ch)
  279. } else if ch == '\n' {
  280. return "\\n"
  281. } else if ch == ' ' {
  282. return "\\ "
  283. }*/
  284. b := &bytes.Buffer{}
  285. escape(b, ch, false) //fmt.Sprintf("%U", ch)
  286. return b.String()
  287. }
  // IsWordChar reports whether r counts as a word character: any rune in the
  // Unicode categories L, Mn, Nd or Pc, plus U+200C ZERO WIDTH NON-JOINER and
  // U+200D ZERO WIDTH JOINER.
  //
  // The two joiners follow UTS#18 Unicode Regular Expressions
  // (http://www.unicode.org/reports/tr18/), RL 1.4 Simple Word Boundaries.
  func IsWordChar(r rune) bool {
  	if r == '\u200C' || r == '\u200D' {
  		return true
  	}
  	return unicode.In(r, unicode.L, unicode.Mn, unicode.Nd, unicode.Pc)
  }
  // IsECMAWordChar reports whether r is in one of the Unicode categories L,
  // Mn, Nd or Pc. Unlike IsWordChar it does not accept U+200C or U+200D.
  func IsECMAWordChar(r rune) bool {
  	return unicode.In(r, unicode.L, unicode.Mn, unicode.Nd, unicode.Pc)
  }
  305. // SingletonChar will return the char from the first range without validation.
  306. // It assumes you have checked for IsSingleton or IsSingletonInverse and will panic given bad input
  307. func (c CharSet) SingletonChar() rune {
  308. return c.ranges[0].first
  309. }
  310. func (c CharSet) IsSingleton() bool {
  311. return !c.negate && //negated is multiple chars
  312. len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars
  313. c.sub == nil && // subtraction means we've got multiple chars
  314. c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char
  315. }
  316. func (c CharSet) IsSingletonInverse() bool {
  317. return c.negate && //same as above, but requires negated
  318. len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars
  319. c.sub == nil && // subtraction means we've got multiple chars
  320. c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char
  321. }
  322. func (c CharSet) IsMergeable() bool {
  323. return !c.IsNegated() && !c.HasSubtraction()
  324. }
  325. func (c CharSet) IsNegated() bool {
  326. return c.negate
  327. }
  328. func (c CharSet) HasSubtraction() bool {
  329. return c.sub != nil
  330. }
  331. func (c CharSet) IsEmpty() bool {
  332. return len(c.ranges) == 0 && len(c.categories) == 0 && c.sub == nil
  333. }
  334. func (c *CharSet) addDigit(ecma, negate bool, pattern string) {
  335. if ecma {
  336. if negate {
  337. c.addRanges(NotECMADigitClass().ranges)
  338. } else {
  339. c.addRanges(ECMADigitClass().ranges)
  340. }
  341. } else {
  342. c.addCategories(category{cat: "Nd", negate: negate})
  343. }
  344. }
  345. func (c *CharSet) addChar(ch rune) {
  346. c.addRange(ch, ch)
  347. }
  348. func (c *CharSet) addSpace(ecma, negate bool) {
  349. if ecma {
  350. if negate {
  351. c.addRanges(NotECMASpaceClass().ranges)
  352. } else {
  353. c.addRanges(ECMASpaceClass().ranges)
  354. }
  355. } else {
  356. c.addCategories(category{cat: spaceCategoryText, negate: negate})
  357. }
  358. }
  359. func (c *CharSet) addWord(ecma, negate bool) {
  360. if ecma {
  361. if negate {
  362. c.addRanges(NotECMAWordClass().ranges)
  363. } else {
  364. c.addRanges(ECMAWordClass().ranges)
  365. }
  366. } else {
  367. c.addCategories(category{cat: wordCategoryText, negate: negate})
  368. }
  369. }
  370. // Add set ranges and categories into ours -- no deduping or anything
  371. func (c *CharSet) addSet(set CharSet) {
  372. if c.anything {
  373. return
  374. }
  375. if set.anything {
  376. c.makeAnything()
  377. return
  378. }
  379. // just append here to prevent double-canon
  380. c.ranges = append(c.ranges, set.ranges...)
  381. c.addCategories(set.categories...)
  382. c.canonicalize()
  383. }
  384. func (c *CharSet) makeAnything() {
  385. c.anything = true
  386. c.categories = []category{}
  387. c.ranges = AnyClass().ranges
  388. }
  389. func (c *CharSet) addCategories(cats ...category) {
  390. // don't add dupes and remove positive+negative
  391. if c.anything {
  392. // if we've had a previous positive+negative group then
  393. // just return, we're as broad as we can get
  394. return
  395. }
  396. for _, ct := range cats {
  397. found := false
  398. for _, ct2 := range c.categories {
  399. if ct.cat == ct2.cat {
  400. if ct.negate != ct2.negate {
  401. // oposite negations...this mean we just
  402. // take us as anything and move on
  403. c.makeAnything()
  404. return
  405. }
  406. found = true
  407. break
  408. }
  409. }
  410. if !found {
  411. c.categories = append(c.categories, ct)
  412. }
  413. }
  414. }
  415. // Merges new ranges to our own
  416. func (c *CharSet) addRanges(ranges []singleRange) {
  417. if c.anything {
  418. return
  419. }
  420. c.ranges = append(c.ranges, ranges...)
  421. c.canonicalize()
  422. }
  423. // Merges everything but the new ranges into our own
  424. func (c *CharSet) addNegativeRanges(ranges []singleRange) {
  425. if c.anything {
  426. return
  427. }
  428. var hi rune
  429. // convert incoming ranges into opposites, assume they are in order
  430. for _, r := range ranges {
  431. if hi < r.first {
  432. c.ranges = append(c.ranges, singleRange{hi, r.first - 1})
  433. }
  434. hi = r.last + 1
  435. }
  436. if hi < utf8.MaxRune {
  437. c.ranges = append(c.ranges, singleRange{hi, utf8.MaxRune})
  438. }
  439. c.canonicalize()
  440. }
  441. func isValidUnicodeCat(catName string) bool {
  442. _, ok := unicodeCategories[catName]
  443. return ok
  444. }
  445. func (c *CharSet) addCategory(categoryName string, negate, caseInsensitive bool, pattern string) {
  446. if !isValidUnicodeCat(categoryName) {
  447. // unknown unicode category, script, or property "blah"
  448. panic(fmt.Errorf("Unknown unicode category, script, or property '%v'", categoryName))
  449. }
  450. if caseInsensitive && (categoryName == "Ll" || categoryName == "Lu" || categoryName == "Lt") {
  451. // when RegexOptions.IgnoreCase is specified then {Ll} {Lu} and {Lt} cases should all match
  452. c.addCategories(
  453. category{cat: "Ll", negate: negate},
  454. category{cat: "Lu", negate: negate},
  455. category{cat: "Lt", negate: negate})
  456. }
  457. c.addCategories(category{cat: categoryName, negate: negate})
  458. }
  459. func (c *CharSet) addSubtraction(sub *CharSet) {
  460. c.sub = sub
  461. }
  462. func (c *CharSet) addRange(chMin, chMax rune) {
  463. c.ranges = append(c.ranges, singleRange{first: chMin, last: chMax})
  464. c.canonicalize()
  465. }
  466. func (c *CharSet) addNamedASCII(name string, negate bool) bool {
  467. var rs []singleRange
  468. switch name {
  469. case "alnum":
  470. rs = []singleRange{singleRange{'0', '9'}, singleRange{'A', 'Z'}, singleRange{'a', 'z'}}
  471. case "alpha":
  472. rs = []singleRange{singleRange{'A', 'Z'}, singleRange{'a', 'z'}}
  473. case "ascii":
  474. rs = []singleRange{singleRange{0, 0x7f}}
  475. case "blank":
  476. rs = []singleRange{singleRange{'\t', '\t'}, singleRange{' ', ' '}}
  477. case "cntrl":
  478. rs = []singleRange{singleRange{0, 0x1f}, singleRange{0x7f, 0x7f}}
  479. case "digit":
  480. c.addDigit(false, negate, "")
  481. case "graph":
  482. rs = []singleRange{singleRange{'!', '~'}}
  483. case "lower":
  484. rs = []singleRange{singleRange{'a', 'z'}}
  485. case "print":
  486. rs = []singleRange{singleRange{' ', '~'}}
  487. case "punct": //[!-/:-@[-`{-~]
  488. rs = []singleRange{singleRange{'!', '/'}, singleRange{':', '@'}, singleRange{'[', '`'}, singleRange{'{', '~'}}
  489. case "space":
  490. c.addSpace(true, negate)
  491. case "upper":
  492. rs = []singleRange{singleRange{'A', 'Z'}}
  493. case "word":
  494. c.addWord(true, negate)
  495. case "xdigit":
  496. rs = []singleRange{singleRange{'0', '9'}, singleRange{'A', 'F'}, singleRange{'a', 'f'}}
  497. default:
  498. return false
  499. }
  500. if len(rs) > 0 {
  501. if negate {
  502. c.addNegativeRanges(rs)
  503. } else {
  504. c.addRanges(rs)
  505. }
  506. }
  507. return true
  508. }
  509. type singleRangeSorter []singleRange
  510. func (p singleRangeSorter) Len() int { return len(p) }
  511. func (p singleRangeSorter) Less(i, j int) bool { return p[i].first < p[j].first }
  512. func (p singleRangeSorter) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
  513. // Logic to reduce a character class to a unique, sorted form.
  514. func (c *CharSet) canonicalize() {
  515. var i, j int
  516. var last rune
  517. //
  518. // Find and eliminate overlapping or abutting ranges
  519. //
  520. if len(c.ranges) > 1 {
  521. sort.Sort(singleRangeSorter(c.ranges))
  522. done := false
  523. for i, j = 1, 0; ; i++ {
  524. for last = c.ranges[j].last; ; i++ {
  525. if i == len(c.ranges) || last == utf8.MaxRune {
  526. done = true
  527. break
  528. }
  529. CurrentRange := c.ranges[i]
  530. if CurrentRange.first > last+1 {
  531. break
  532. }
  533. if last < CurrentRange.last {
  534. last = CurrentRange.last
  535. }
  536. }
  537. c.ranges[j] = singleRange{first: c.ranges[j].first, last: last}
  538. j++
  539. if done {
  540. break
  541. }
  542. if j < i {
  543. c.ranges[j] = c.ranges[i]
  544. }
  545. }
  546. c.ranges = append(c.ranges[:j], c.ranges[len(c.ranges):]...)
  547. }
  548. }
  549. // Adds to the class any lowercase versions of characters already
  550. // in the class. Used for case-insensitivity.
  551. func (c *CharSet) addLowercase() {
  552. if c.anything {
  553. return
  554. }
  555. toAdd := []singleRange{}
  556. for i := 0; i < len(c.ranges); i++ {
  557. r := c.ranges[i]
  558. if r.first == r.last {
  559. lower := unicode.ToLower(r.first)
  560. c.ranges[i] = singleRange{first: lower, last: lower}
  561. } else {
  562. toAdd = append(toAdd, r)
  563. }
  564. }
  565. for _, r := range toAdd {
  566. c.addLowercaseRange(r.first, r.last)
  567. }
  568. c.canonicalize()
  569. }
  570. /**************************************************************************
  571. Let U be the set of Unicode character values and let L be the lowercase
  572. function, mapping from U to U. To perform case insensitive matching of
  573. character sets, we need to be able to map an interval I in U, say
  574. I = [chMin, chMax] = { ch : chMin <= ch <= chMax }
  575. to a set A such that A contains L(I) and A is contained in the union of
  576. I and L(I).
  577. The table below partitions U into intervals on which L is non-decreasing.
  578. Thus, for any interval J = [a, b] contained in one of these intervals,
  579. L(J) is contained in [L(a), L(b)].
  580. It is also true that for any such J, [L(a), L(b)] is contained in the
  581. union of J and L(J). This does not follow from L being non-decreasing on
  582. these intervals. It follows from the nature of the L on each interval.
  583. On each interval, L has one of the following forms:
  584. (1) L(ch) = constant (LowercaseSet)
  585. (2) L(ch) = ch + offset (LowercaseAdd)
  586. (3) L(ch) = ch | 1 (LowercaseBor)
  587. (4) L(ch) = ch + (ch & 1) (LowercaseBad)
  588. It is easy to verify that for any of these forms [L(a), L(b)] is
  589. contained in the union of [a, b] and L([a, b]).
  590. ***************************************************************************/
  // Operations an lcTable entry can apply to map a range to lowercase.
  const (
  	LowercaseSet = iota // Set to arg.
  	LowercaseAdd        // Add arg.
  	LowercaseBor        // Bitwise or with 1.
  	LowercaseBad        // Bitwise and with 1 and add original.
  )
  // lcMap is one row of lcTable: an inclusive rune interval together with the
  // operation that lowercases it.
  type lcMap struct {
  	chMin rune  // first rune of the interval
  	chMax rune  // last rune of the interval
  	op    int32 // one of the Lowercase* constants
  	data  int32 // operand for LowercaseSet and LowercaseAdd
  }
  601. var lcTable = []lcMap{
  602. lcMap{'\u0041', '\u005A', LowercaseAdd, 32},
  603. lcMap{'\u00C0', '\u00DE', LowercaseAdd, 32},
  604. lcMap{'\u0100', '\u012E', LowercaseBor, 0},
  605. lcMap{'\u0130', '\u0130', LowercaseSet, 0x0069},
  606. lcMap{'\u0132', '\u0136', LowercaseBor, 0},
  607. lcMap{'\u0139', '\u0147', LowercaseBad, 0},
  608. lcMap{'\u014A', '\u0176', LowercaseBor, 0},
  609. lcMap{'\u0178', '\u0178', LowercaseSet, 0x00FF},
  610. lcMap{'\u0179', '\u017D', LowercaseBad, 0},
  611. lcMap{'\u0181', '\u0181', LowercaseSet, 0x0253},
  612. lcMap{'\u0182', '\u0184', LowercaseBor, 0},
  613. lcMap{'\u0186', '\u0186', LowercaseSet, 0x0254},
  614. lcMap{'\u0187', '\u0187', LowercaseSet, 0x0188},
  615. lcMap{'\u0189', '\u018A', LowercaseAdd, 205},
  616. lcMap{'\u018B', '\u018B', LowercaseSet, 0x018C},
  617. lcMap{'\u018E', '\u018E', LowercaseSet, 0x01DD},
  618. lcMap{'\u018F', '\u018F', LowercaseSet, 0x0259},
  619. lcMap{'\u0190', '\u0190', LowercaseSet, 0x025B},
  620. lcMap{'\u0191', '\u0191', LowercaseSet, 0x0192},
  621. lcMap{'\u0193', '\u0193', LowercaseSet, 0x0260},
  622. lcMap{'\u0194', '\u0194', LowercaseSet, 0x0263},
  623. lcMap{'\u0196', '\u0196', LowercaseSet, 0x0269},
  624. lcMap{'\u0197', '\u0197', LowercaseSet, 0x0268},
  625. lcMap{'\u0198', '\u0198', LowercaseSet, 0x0199},
  626. lcMap{'\u019C', '\u019C', LowercaseSet, 0x026F},
  627. lcMap{'\u019D', '\u019D', LowercaseSet, 0x0272},
  628. lcMap{'\u019F', '\u019F', LowercaseSet, 0x0275},
  629. lcMap{'\u01A0', '\u01A4', LowercaseBor, 0},
  630. lcMap{'\u01A7', '\u01A7', LowercaseSet, 0x01A8},
  631. lcMap{'\u01A9', '\u01A9', LowercaseSet, 0x0283},
  632. lcMap{'\u01AC', '\u01AC', LowercaseSet, 0x01AD},
  633. lcMap{'\u01AE', '\u01AE', LowercaseSet, 0x0288},
  634. lcMap{'\u01AF', '\u01AF', LowercaseSet, 0x01B0},
  635. lcMap{'\u01B1', '\u01B2', LowercaseAdd, 217},
  636. lcMap{'\u01B3', '\u01B5', LowercaseBad, 0},
  637. lcMap{'\u01B7', '\u01B7', LowercaseSet, 0x0292},
  638. lcMap{'\u01B8', '\u01B8', LowercaseSet, 0x01B9},
  639. lcMap{'\u01BC', '\u01BC', LowercaseSet, 0x01BD},
  640. lcMap{'\u01C4', '\u01C5', LowercaseSet, 0x01C6},
  641. lcMap{'\u01C7', '\u01C8', LowercaseSet, 0x01C9},
  642. lcMap{'\u01CA', '\u01CB', LowercaseSet, 0x01CC},
  643. lcMap{'\u01CD', '\u01DB', LowercaseBad, 0},
  644. lcMap{'\u01DE', '\u01EE', LowercaseBor, 0},
  645. lcMap{'\u01F1', '\u01F2', LowercaseSet, 0x01F3},
  646. lcMap{'\u01F4', '\u01F4', LowercaseSet, 0x01F5},
  647. lcMap{'\u01FA', '\u0216', LowercaseBor, 0},
  648. lcMap{'\u0386', '\u0386', LowercaseSet, 0x03AC},
  649. lcMap{'\u0388', '\u038A', LowercaseAdd, 37},
  650. lcMap{'\u038C', '\u038C', LowercaseSet, 0x03CC},
  651. lcMap{'\u038E', '\u038F', LowercaseAdd, 63},
  652. lcMap{'\u0391', '\u03AB', LowercaseAdd, 32},
  653. lcMap{'\u03E2', '\u03EE', LowercaseBor, 0},
  654. lcMap{'\u0401', '\u040F', LowercaseAdd, 80},
  655. lcMap{'\u0410', '\u042F', LowercaseAdd, 32},
  656. lcMap{'\u0460', '\u0480', LowercaseBor, 0},
  657. lcMap{'\u0490', '\u04BE', LowercaseBor, 0},
  658. lcMap{'\u04C1', '\u04C3', LowercaseBad, 0},
  659. lcMap{'\u04C7', '\u04C7', LowercaseSet, 0x04C8},
  660. lcMap{'\u04CB', '\u04CB', LowercaseSet, 0x04CC},
  661. lcMap{'\u04D0', '\u04EA', LowercaseBor, 0},
  662. lcMap{'\u04EE', '\u04F4', LowercaseBor, 0},
  663. lcMap{'\u04F8', '\u04F8', LowercaseSet, 0x04F9},
  664. lcMap{'\u0531', '\u0556', LowercaseAdd, 48},
  665. lcMap{'\u10A0', '\u10C5', LowercaseAdd, 48},
  666. lcMap{'\u1E00', '\u1EF8', LowercaseBor, 0},
  667. lcMap{'\u1F08', '\u1F0F', LowercaseAdd, -8},
  668. lcMap{'\u1F18', '\u1F1F', LowercaseAdd, -8},
  669. lcMap{'\u1F28', '\u1F2F', LowercaseAdd, -8},
  670. lcMap{'\u1F38', '\u1F3F', LowercaseAdd, -8},
  671. lcMap{'\u1F48', '\u1F4D', LowercaseAdd, -8},
  672. lcMap{'\u1F59', '\u1F59', LowercaseSet, 0x1F51},
  673. lcMap{'\u1F5B', '\u1F5B', LowercaseSet, 0x1F53},
  674. lcMap{'\u1F5D', '\u1F5D', LowercaseSet, 0x1F55},
  675. lcMap{'\u1F5F', '\u1F5F', LowercaseSet, 0x1F57},
  676. lcMap{'\u1F68', '\u1F6F', LowercaseAdd, -8},
  677. lcMap{'\u1F88', '\u1F8F', LowercaseAdd, -8},
  678. lcMap{'\u1F98', '\u1F9F', LowercaseAdd, -8},
  679. lcMap{'\u1FA8', '\u1FAF', LowercaseAdd, -8},
  680. lcMap{'\u1FB8', '\u1FB9', LowercaseAdd, -8},
  681. lcMap{'\u1FBA', '\u1FBB', LowercaseAdd, -74},
  682. lcMap{'\u1FBC', '\u1FBC', LowercaseSet, 0x1FB3},
  683. lcMap{'\u1FC8', '\u1FCB', LowercaseAdd, -86},
  684. lcMap{'\u1FCC', '\u1FCC', LowercaseSet, 0x1FC3},
  685. lcMap{'\u1FD8', '\u1FD9', LowercaseAdd, -8},
  686. lcMap{'\u1FDA', '\u1FDB', LowercaseAdd, -100},
  687. lcMap{'\u1FE8', '\u1FE9', LowercaseAdd, -8},
  688. lcMap{'\u1FEA', '\u1FEB', LowercaseAdd, -112},
  689. lcMap{'\u1FEC', '\u1FEC', LowercaseSet, 0x1FE5},
  690. lcMap{'\u1FF8', '\u1FF9', LowercaseAdd, -128},
  691. lcMap{'\u1FFA', '\u1FFB', LowercaseAdd, -126},
  692. lcMap{'\u1FFC', '\u1FFC', LowercaseSet, 0x1FF3},
  693. lcMap{'\u2160', '\u216F', LowercaseAdd, 16},
  694. lcMap{'\u24B6', '\u24D0', LowercaseAdd, 26},
  695. lcMap{'\uFF21', '\uFF3A', LowercaseAdd, 32},
  696. }
  697. func (c *CharSet) addLowercaseRange(chMin, chMax rune) {
  698. var i, iMax, iMid int
  699. var chMinT, chMaxT rune
  700. var lc lcMap
  701. for i, iMax = 0, len(lcTable); i < iMax; {
  702. iMid = (i + iMax) / 2
  703. if lcTable[iMid].chMax < chMin {
  704. i = iMid + 1
  705. } else {
  706. iMax = iMid
  707. }
  708. }
  709. for ; i < len(lcTable); i++ {
  710. lc = lcTable[i]
  711. if lc.chMin > chMax {
  712. return
  713. }
  714. chMinT = lc.chMin
  715. if chMinT < chMin {
  716. chMinT = chMin
  717. }
  718. chMaxT = lc.chMax
  719. if chMaxT > chMax {
  720. chMaxT = chMax
  721. }
  722. switch lc.op {
  723. case LowercaseSet:
  724. chMinT = rune(lc.data)
  725. chMaxT = rune(lc.data)
  726. break
  727. case LowercaseAdd:
  728. chMinT += lc.data
  729. chMaxT += lc.data
  730. break
  731. case LowercaseBor:
  732. chMinT |= 1
  733. chMaxT |= 1
  734. break
  735. case LowercaseBad:
  736. chMinT += (chMinT & 1)
  737. chMaxT += (chMaxT & 1)
  738. break
  739. }
  740. if chMinT < chMin || chMaxT > chMax {
  741. c.addRange(chMinT, chMaxT)
  742. }
  743. }
  744. }