You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

translate.go 12KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547
  1. // Copyright 2015 Huan Du. All rights reserved.
  2. // Licensed under the MIT license that can be found in the LICENSE file.
  3. package xstrings
  4. import (
  5. "bytes"
  6. "unicode"
  7. "unicode/utf8"
  8. )
// runeRangeMap maps the inclusive rune range [FromLo, FromHi] onto the runes
// between ToLo and ToHi.
type runeRangeMap struct {
	FromLo rune // Lower bound of range map.
	FromHi rune // An inclusive higher bound of range map.
	ToLo   rune // Rune that FromLo is translated to.
	ToHi   rune // Other end of the target range. May be smaller than ToLo (descending mapping) or equal to it (every rune in the range maps to ToLo).
}
// runeDict is a fixed-size lookup table indexed by rune value for runes in
// 0..unicode.MaxASCII. A zero entry is treated as "no mapping" by TranslateRune.
type runeDict struct {
	Dict [unicode.MaxASCII + 1]rune
}
// runeMap maps a single source rune to its translated rune. It holds the
// single-rune mappings that do not fit in runeDict (runes above unicode.MaxASCII).
type runeMap map[rune]rune
// Translator can translate string with pre-compiled from and to patterns.
// If a from/to pattern pair needs to be used more than once, it's recommended
// to create a Translator and reuse it.
type Translator struct {
	quickDict  *runeDict       // Quick lookup table for single runes up to unicode.MaxASCII. Nil until the first such rune is added; a zero entry means "no mapping".
	runeMap    runeMap         // Mapping for single runes above unicode.MaxASCII. Nil until the first such rune is added.
	ranges     []*runeRangeMap // Ranges of runes. Searched from last to first, so a later range wins over an earlier one.
	mappedRune rune            // If mappedRune >= 0, all matched runes are translated to the mappedRune. utf8.RuneError marks runes that Translate removes.
	reverted   bool            // True if the from pattern started with '^': runes that do NOT match the pattern are the ones translated.
	hasPattern bool            // True once a non-empty from pattern has been compiled by NewTranslator.
}
// NewTranslator creates new Translator through a from/to pattern pair.
//
// An empty from pattern yields a Translator without any pattern. A leading '^'
// in from inverts the set of matched runes, and an empty to pattern makes the
// Translator delete matched runes instead of replacing them. See the package
// level Translate function for the pattern syntax.
func NewTranslator(from, to string) *Translator {
	tr := &Translator{}

	// Nothing to compile: hasPattern stays false on the returned Translator.
	if from == "" {
		return tr
	}

	reverted := from[0] == '^'
	deletion := len(to) == 0

	// Strip the '^' marker; the rest of the pattern is parsed normally.
	if reverted {
		from = from[1:]
	}

	// fromStart/fromEnd and toStart/toEnd hold the item most recently read from
	// each pattern by nextRuneRange. A non-zero range step (+1 or -1) means the
	// item is a range; a zero step means start is a single rune and end is the
	// rune read after it (or utf8.RuneError if the pattern is exhausted).
	var fromStart, fromEnd, fromRangeStep rune
	var toStart, toEnd, toRangeStep rune
	var fromRangeSize, toRangeSize rune
	// Single runes registered so far; addRuneRange uses the list to clear
	// single-rune entries that a later range covers.
	var singleRunes []rune

	// Update the to rune range: advance toStart to the next target rune.
	// Once the to pattern is exhausted, toStart keeps its last value so the
	// last rune is repeated.
	updateRange := func() {
		// No more rune to read in the to rune pattern.
		if toEnd == utf8.RuneError {
			return
		}

		// Current item is a single rune; toEnd is the look-ahead rune, so
		// continue parsing from it.
		if toRangeStep == 0 {
			to, toStart, toEnd, toRangeStep = nextRuneRange(to, toEnd)
			return
		}

		// Current range is not empty. Consume 1 rune from start.
		if toStart != toEnd {
			toStart += toRangeStep
			return
		}

		// No more rune. Repeat the last rune.
		if to == "" {
			toEnd = utf8.RuneError
			return
		}

		// Both start and end are used. Read two more runes from the to pattern.
		to, toStart, toEnd, toRangeStep = nextRuneRange(to, utf8.RuneError)
	}

	if deletion {
		// Deletion maps every matched rune to utf8.RuneError.
		toStart = utf8.RuneError
		toEnd = utf8.RuneError
	} else {
		// If from pattern is reverted, only the last rune in the to pattern will be used.
		if reverted {
			var size int

			for len(to) > 0 {
				toStart, size = utf8.DecodeRuneInString(to)
				to = to[size:]
			}

			toEnd = utf8.RuneError
		} else {
			to, toStart, toEnd, toRangeStep = nextRuneRange(to, utf8.RuneError)
		}
	}

	fromEnd = utf8.RuneError

	for len(from) > 0 {
		// Pass the previous fromEnd back in so a look-ahead rune is not lost.
		from, fromStart, fromEnd, fromRangeStep = nextRuneRange(from, fromEnd)

		// fromStart is a single character. Just map it with a rune in the to pattern.
		if fromRangeStep == 0 {
			singleRunes = tr.addRune(fromStart, toStart, singleRunes)
			updateRange()
			continue
		}

		// fromStart..fromEnd is a range. Pair it with target runes for as long
		// as the to pattern still has runes to offer.
		for toEnd != utf8.RuneError && fromStart != fromEnd {
			// If mapped rune is a single character instead of a range, simply shift first
			// rune in the range.
			if toRangeStep == 0 {
				singleRunes = tr.addRune(fromStart, toStart, singleRunes)
				updateRange()
				fromStart += fromRangeStep
				continue
			}

			// Multiplying by the step (+1/-1) makes both sizes non-negative.
			fromRangeSize = (fromEnd - fromStart) * fromRangeStep
			toRangeSize = (toEnd - toStart) * toRangeStep

			// Not enough runes in the to pattern. Need to read more.
			if fromRangeSize > toRangeSize {
				fromStart, toStart = tr.addRuneRange(fromStart, fromStart+toRangeSize*fromRangeStep, toStart, toEnd, singleRunes)
				fromStart += fromRangeStep
				updateRange()

				// Edge case: If fromRangeSize == toRangeSize + 1, the last fromStart value needs be considered
				// as a single rune.
				if fromStart == fromEnd {
					singleRunes = tr.addRune(fromStart, toStart, singleRunes)
					updateRange()
				}

				continue
			}

			// The to range is at least as long as the from range: map the whole
			// remaining from range onto the leading part of the to range.
			fromStart, toStart = tr.addRuneRange(fromStart, fromEnd, toStart, toStart+fromRangeSize*toRangeStep, singleRunes)
			updateRange()
			break
		}

		// The from range has been fully consumed.
		if fromStart == fromEnd {
			fromEnd = utf8.RuneError
			continue
		}

		// The to pattern ran out first: map the rest of the from range onto the
		// single rune toStart.
		fromStart, toStart = tr.addRuneRange(fromStart, fromEnd, toStart, toStart, singleRunes)
		fromEnd = utf8.RuneError
	}

	// A trailing look-ahead rune from the from pattern is still unmapped.
	if fromEnd != utf8.RuneError {
		singleRunes = tr.addRune(fromEnd, toStart, singleRunes)
	}

	tr.reverted = reverted
	tr.mappedRune = -1
	tr.hasPattern = true

	// Translate RuneError only if in deletion or reverted mode.
	if deletion || reverted {
		tr.mappedRune = toStart
	}

	return tr
}
  140. func (tr *Translator) addRune(from, to rune, singleRunes []rune) []rune {
  141. if from <= unicode.MaxASCII {
  142. if tr.quickDict == nil {
  143. tr.quickDict = &runeDict{}
  144. }
  145. tr.quickDict.Dict[from] = to
  146. } else {
  147. if tr.runeMap == nil {
  148. tr.runeMap = make(runeMap)
  149. }
  150. tr.runeMap[from] = to
  151. }
  152. singleRunes = append(singleRunes, from)
  153. return singleRunes
  154. }
  155. func (tr *Translator) addRuneRange(fromLo, fromHi, toLo, toHi rune, singleRunes []rune) (rune, rune) {
  156. var r rune
  157. var rrm *runeRangeMap
  158. if fromLo < fromHi {
  159. rrm = &runeRangeMap{
  160. FromLo: fromLo,
  161. FromHi: fromHi,
  162. ToLo: toLo,
  163. ToHi: toHi,
  164. }
  165. } else {
  166. rrm = &runeRangeMap{
  167. FromLo: fromHi,
  168. FromHi: fromLo,
  169. ToLo: toHi,
  170. ToHi: toLo,
  171. }
  172. }
  173. // If there is any single rune conflicts with this rune range, clear single rune record.
  174. for _, r = range singleRunes {
  175. if rrm.FromLo <= r && r <= rrm.FromHi {
  176. if r <= unicode.MaxASCII {
  177. tr.quickDict.Dict[r] = 0
  178. } else {
  179. delete(tr.runeMap, r)
  180. }
  181. }
  182. }
  183. tr.ranges = append(tr.ranges, rrm)
  184. return fromHi, toHi
  185. }
  186. func nextRuneRange(str string, last rune) (remaining string, start, end rune, rangeStep rune) {
  187. var r rune
  188. var size int
  189. remaining = str
  190. escaping := false
  191. isRange := false
  192. for len(remaining) > 0 {
  193. r, size = utf8.DecodeRuneInString(remaining)
  194. remaining = remaining[size:]
  195. // Parse special characters.
  196. if !escaping {
  197. if r == '\\' {
  198. escaping = true
  199. continue
  200. }
  201. if r == '-' {
  202. // Ignore slash at beginning of string.
  203. if last == utf8.RuneError {
  204. continue
  205. }
  206. start = last
  207. isRange = true
  208. continue
  209. }
  210. }
  211. escaping = false
  212. if last != utf8.RuneError {
  213. // This is a range which start and end are the same.
  214. // Considier it as a normal character.
  215. if isRange && last == r {
  216. isRange = false
  217. continue
  218. }
  219. start = last
  220. end = r
  221. if isRange {
  222. if start < end {
  223. rangeStep = 1
  224. } else {
  225. rangeStep = -1
  226. }
  227. }
  228. return
  229. }
  230. last = r
  231. }
  232. start = last
  233. end = utf8.RuneError
  234. return
  235. }
  236. // Translate str with a from/to pattern pair.
  237. //
  238. // See comment in Translate function for usage and samples.
  239. func (tr *Translator) Translate(str string) string {
  240. if !tr.hasPattern || str == "" {
  241. return str
  242. }
  243. var r rune
  244. var size int
  245. var needTr bool
  246. orig := str
  247. var output *bytes.Buffer
  248. for len(str) > 0 {
  249. r, size = utf8.DecodeRuneInString(str)
  250. r, needTr = tr.TranslateRune(r)
  251. if needTr && output == nil {
  252. output = allocBuffer(orig, str)
  253. }
  254. if r != utf8.RuneError && output != nil {
  255. output.WriteRune(r)
  256. }
  257. str = str[size:]
  258. }
  259. // No character is translated.
  260. if output == nil {
  261. return orig
  262. }
  263. return output.String()
  264. }
  265. // TranslateRune return translated rune and true if r matches the from pattern.
  266. // If r doesn't match the pattern, original r is returned and translated is false.
  267. func (tr *Translator) TranslateRune(r rune) (result rune, translated bool) {
  268. switch {
  269. case tr.quickDict != nil:
  270. if r <= unicode.MaxASCII {
  271. result = tr.quickDict.Dict[r]
  272. if result != 0 {
  273. translated = true
  274. if tr.mappedRune >= 0 {
  275. result = tr.mappedRune
  276. }
  277. break
  278. }
  279. }
  280. fallthrough
  281. case tr.runeMap != nil:
  282. var ok bool
  283. if result, ok = tr.runeMap[r]; ok {
  284. translated = true
  285. if tr.mappedRune >= 0 {
  286. result = tr.mappedRune
  287. }
  288. break
  289. }
  290. fallthrough
  291. default:
  292. var rrm *runeRangeMap
  293. ranges := tr.ranges
  294. for i := len(ranges) - 1; i >= 0; i-- {
  295. rrm = ranges[i]
  296. if rrm.FromLo <= r && r <= rrm.FromHi {
  297. translated = true
  298. if tr.mappedRune >= 0 {
  299. result = tr.mappedRune
  300. break
  301. }
  302. if rrm.ToLo < rrm.ToHi {
  303. result = rrm.ToLo + r - rrm.FromLo
  304. } else if rrm.ToLo > rrm.ToHi {
  305. // ToHi can be smaller than ToLo if range is from higher to lower.
  306. result = rrm.ToLo - r + rrm.FromLo
  307. } else {
  308. result = rrm.ToLo
  309. }
  310. break
  311. }
  312. }
  313. }
  314. if tr.reverted {
  315. if !translated {
  316. result = tr.mappedRune
  317. }
  318. translated = !translated
  319. }
  320. if !translated {
  321. result = r
  322. }
  323. return
  324. }
// HasPattern reports whether the Translator holds a compiled pattern.
// It is false for a Translator created from an empty from pattern; such a
// Translator returns every string unchanged from Translate.
func (tr *Translator) HasPattern() bool {
	return tr.hasPattern
}
  329. // Translate str with the characters defined in from replaced by characters defined in to.
  330. //
  331. // From and to are patterns representing a set of characters. Pattern is defined as following.
  332. //
  333. // * Special characters
  334. // * '-' means a range of runes, e.g.
  335. // * "a-z" means all characters from 'a' to 'z' inclusive;
  336. // * "z-a" means all characters from 'z' to 'a' inclusive.
  337. // * '^' as first character means a set of all runes excepted listed, e.g.
  338. // * "^a-z" means all characters except 'a' to 'z' inclusive.
  339. // * '\' escapes special characters.
  340. // * Normal character represents itself, e.g. "abc" is a set including 'a', 'b' and 'c'.
  341. //
  342. // Translate will try to find a 1:1 mapping from from to to.
  343. // If to is smaller than from, last rune in to will be used to map "out of range" characters in from.
  344. //
  345. // Note that '^' only works in the from pattern. It will be considered as a normal character in the to pattern.
  346. //
  347. // If the to pattern is an empty string, Translate works exactly the same as Delete.
  348. //
  349. // Samples:
  350. // Translate("hello", "aeiou", "12345") => "h2ll4"
  351. // Translate("hello", "a-z", "A-Z") => "HELLO"
  352. // Translate("hello", "z-a", "a-z") => "svool"
  353. // Translate("hello", "aeiou", "*") => "h*ll*"
  354. // Translate("hello", "^l", "*") => "**ll*"
  355. // Translate("hello ^ world", `\^lo`, "*") => "he*** * w*r*d"
  356. func Translate(str, from, to string) string {
  357. tr := NewTranslator(from, to)
  358. return tr.Translate(str)
  359. }
  360. // Delete runes in str matching the pattern.
  361. // Pattern is defined in Translate function.
  362. //
  363. // Samples:
  364. // Delete("hello", "aeiou") => "hll"
  365. // Delete("hello", "a-k") => "llo"
  366. // Delete("hello", "^a-k") => "he"
  367. func Delete(str, pattern string) string {
  368. tr := NewTranslator(pattern, "")
  369. return tr.Translate(str)
  370. }
  371. // Count how many runes in str match the pattern.
  372. // Pattern is defined in Translate function.
  373. //
  374. // Samples:
  375. // Count("hello", "aeiou") => 3
  376. // Count("hello", "a-k") => 3
  377. // Count("hello", "^a-k") => 2
  378. func Count(str, pattern string) int {
  379. if pattern == "" || str == "" {
  380. return 0
  381. }
  382. var r rune
  383. var size int
  384. var matched bool
  385. tr := NewTranslator(pattern, "")
  386. cnt := 0
  387. for len(str) > 0 {
  388. r, size = utf8.DecodeRuneInString(str)
  389. str = str[size:]
  390. if _, matched = tr.TranslateRune(r); matched {
  391. cnt++
  392. }
  393. }
  394. return cnt
  395. }
  396. // Squeeze deletes adjacent repeated runes in str.
  397. // If pattern is not empty, only runes matching the pattern will be squeezed.
  398. //
  399. // Samples:
  400. // Squeeze("hello", "") => "helo"
  401. // Squeeze("hello", "m-z") => "hello"
  402. // Squeeze("hello world", " ") => "hello world"
  403. func Squeeze(str, pattern string) string {
  404. var last, r rune
  405. var size int
  406. var skipSqueeze, matched bool
  407. var tr *Translator
  408. var output *bytes.Buffer
  409. orig := str
  410. last = -1
  411. if len(pattern) > 0 {
  412. tr = NewTranslator(pattern, "")
  413. }
  414. for len(str) > 0 {
  415. r, size = utf8.DecodeRuneInString(str)
  416. // Need to squeeze the str.
  417. if last == r && !skipSqueeze {
  418. if tr != nil {
  419. if _, matched = tr.TranslateRune(r); !matched {
  420. skipSqueeze = true
  421. }
  422. }
  423. if output == nil {
  424. output = allocBuffer(orig, str)
  425. }
  426. if skipSqueeze {
  427. output.WriteRune(r)
  428. }
  429. } else {
  430. if output != nil {
  431. output.WriteRune(r)
  432. }
  433. last = r
  434. skipSqueeze = false
  435. }
  436. str = str[size:]
  437. }
  438. if output == nil {
  439. return orig
  440. }
  441. return output.String()
  442. }