You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

regex.go 15KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655
  1. package rubex
  2. /*
  3. #cgo CFLAGS: -I/usr/local/include
  4. #cgo LDFLAGS: -L/usr/local/lib -lonig
  5. #include <stdlib.h>
  6. #include <oniguruma.h>
  7. #include "chelper.h"
  8. */
  9. import "C"
  10. import (
  11. "bytes"
  12. "errors"
  13. "fmt"
  14. "io"
  15. "runtime"
  16. "strconv"
  17. "sync"
  18. "unicode/utf8"
  19. "unsafe"
  20. )
  const (
  	// numMatchStartSize is the initial capacity of the match list built by findAll.
  	numMatchStartSize = 4
  	// numReadBufferStartSize is the initial size of the buffer used by fromReader.
  	numReadBufferStartSize = 256
  )

  // mutex is held while a regex is compiled (initRegexp) and while the compiled
  // regex is released (Free).
  var mutex sync.Mutex
  24. type NamedGroupInfo map[string]int
  25. type Regexp struct {
  26. pattern string
  27. regex C.OnigRegex
  28. encoding C.OnigEncoding
  29. errorInfo *C.OnigErrorInfo
  30. errorBuf *C.char
  31. numCaptures int32
  32. namedGroupInfo NamedGroupInfo
  33. }
  34. // NewRegexp creates and initializes a new Regexp with the given pattern and option.
  35. func NewRegexp(pattern string, option int) (*Regexp, error) {
  36. return initRegexp(&Regexp{pattern: pattern, encoding: C.ONIG_ENCODING_UTF8}, option)
  37. }
  38. // NewRegexpASCII is equivalent to NewRegexp, but with the encoding restricted to ASCII.
  39. func NewRegexpASCII(pattern string, option int) (*Regexp, error) {
  40. return initRegexp(&Regexp{pattern: pattern, encoding: C.ONIG_ENCODING_ASCII}, option)
  41. }
  42. func initRegexp(re *Regexp, option int) (*Regexp, error) {
  43. patternCharPtr := C.CString(re.pattern)
  44. defer C.free(unsafe.Pointer(patternCharPtr))
  45. mutex.Lock()
  46. defer mutex.Unlock()
  47. errorCode := C.NewOnigRegex(patternCharPtr, C.int(len(re.pattern)), C.int(option), &re.regex, &re.encoding, &re.errorInfo, &re.errorBuf)
  48. if errorCode != C.ONIG_NORMAL {
  49. return re, errors.New(C.GoString(re.errorBuf))
  50. }
  51. re.numCaptures = int32(C.onig_number_of_captures(re.regex)) + 1
  52. re.namedGroupInfo = re.getNamedGroupInfo()
  53. runtime.SetFinalizer(re, (*Regexp).Free)
  54. return re, nil
  55. }
  56. func Compile(str string) (*Regexp, error) {
  57. return NewRegexp(str, ONIG_OPTION_DEFAULT)
  58. }
  59. func MustCompile(str string) *Regexp {
  60. regexp, error := NewRegexp(str, ONIG_OPTION_DEFAULT)
  61. if error != nil {
  62. panic("regexp: compiling " + str + ": " + error.Error())
  63. }
  64. return regexp
  65. }
  66. func CompileWithOption(str string, option int) (*Regexp, error) {
  67. return NewRegexp(str, option)
  68. }
  69. func MustCompileWithOption(str string, option int) *Regexp {
  70. regexp, error := NewRegexp(str, option)
  71. if error != nil {
  72. panic("regexp: compiling " + str + ": " + error.Error())
  73. }
  74. return regexp
  75. }
  76. // MustCompileASCII is equivalent to MustCompile, but with the encoding restricted to ASCII.
  77. func MustCompileASCII(str string) *Regexp {
  78. regexp, error := NewRegexpASCII(str, ONIG_OPTION_DEFAULT)
  79. if error != nil {
  80. panic("regexp: compiling " + str + ": " + error.Error())
  81. }
  82. return regexp
  83. }
  84. func (re *Regexp) Free() {
  85. mutex.Lock()
  86. if re.regex != nil {
  87. C.onig_free(re.regex)
  88. re.regex = nil
  89. }
  90. mutex.Unlock()
  91. if re.errorInfo != nil {
  92. C.free(unsafe.Pointer(re.errorInfo))
  93. re.errorInfo = nil
  94. }
  95. if re.errorBuf != nil {
  96. C.free(unsafe.Pointer(re.errorBuf))
  97. re.errorBuf = nil
  98. }
  99. }
  100. func (re *Regexp) getNamedGroupInfo() NamedGroupInfo {
  101. numNamedGroups := int(C.onig_number_of_names(re.regex))
  102. // when any named capture exists, there is no numbered capture even if
  103. // there are unnamed captures.
  104. if numNamedGroups == 0 {
  105. return nil
  106. }
  107. namedGroupInfo := make(map[string]int)
  108. //try to get the names
  109. bufferSize := len(re.pattern) * 2
  110. nameBuffer := make([]byte, bufferSize)
  111. groupNumbers := make([]int32, numNamedGroups)
  112. bufferPtr := unsafe.Pointer(&nameBuffer[0])
  113. numbersPtr := unsafe.Pointer(&groupNumbers[0])
  114. length := int(C.GetCaptureNames(re.regex, bufferPtr, (C.int)(bufferSize), (*C.int)(numbersPtr)))
  115. if length == 0 {
  116. panic(fmt.Errorf("could not get the capture group names from %q", re.String()))
  117. }
  118. namesAsBytes := bytes.Split(nameBuffer[:length], ([]byte)(";"))
  119. if len(namesAsBytes) != numNamedGroups {
  120. panic(fmt.Errorf(
  121. "the number of named groups (%d) does not match the number names found (%d)",
  122. numNamedGroups, len(namesAsBytes),
  123. ))
  124. }
  125. for i, nameAsBytes := range namesAsBytes {
  126. name := string(nameAsBytes)
  127. namedGroupInfo[name] = int(groupNumbers[i])
  128. }
  129. return namedGroupInfo
  130. }
  131. func (re *Regexp) find(b []byte, n int, offset int) []int {
  132. match := make([]int, re.numCaptures*2)
  133. if n == 0 {
  134. b = []byte{0}
  135. }
  136. bytesPtr := unsafe.Pointer(&b[0])
  137. // captures contains two pairs of ints, start and end, so we need list
  138. // twice the size of the capture groups.
  139. captures := make([]C.int, re.numCaptures*2)
  140. capturesPtr := unsafe.Pointer(&captures[0])
  141. var numCaptures int32
  142. numCapturesPtr := unsafe.Pointer(&numCaptures)
  143. pos := int(C.SearchOnigRegex(
  144. bytesPtr, C.int(n), C.int(offset), C.int(ONIG_OPTION_DEFAULT),
  145. re.regex, re.errorInfo, (*C.char)(nil), (*C.int)(capturesPtr), (*C.int)(numCapturesPtr),
  146. ))
  147. if pos < 0 {
  148. return nil
  149. }
  150. if numCaptures <= 0 {
  151. panic("cannot have 0 captures when processing a match")
  152. }
  153. if re.numCaptures != numCaptures {
  154. panic(fmt.Errorf("expected %d captures but got %d", re.numCaptures, numCaptures))
  155. }
  156. for i := range captures {
  157. match[i] = int(captures[i])
  158. }
  159. return match
  160. }
  // getCapture returns b[beg:end], or nil when either index is negative (the
  // case the callers use for "no capture").
  func getCapture(b []byte, beg int, end int) []byte {
  	if beg >= 0 && end >= 0 {
  		return b[beg:end]
  	}
  	return nil
  }
  167. func (re *Regexp) match(b []byte, n int, offset int) bool {
  168. if n == 0 {
  169. b = []byte{0}
  170. }
  171. bytesPtr := unsafe.Pointer(&b[0])
  172. pos := int(C.SearchOnigRegex(
  173. bytesPtr, C.int(n), C.int(offset), C.int(ONIG_OPTION_DEFAULT),
  174. re.regex, re.errorInfo, nil, nil, nil,
  175. ))
  176. return pos >= 0
  177. }
  178. func (re *Regexp) findAll(b []byte, n int) [][]int {
  179. if n < 0 {
  180. n = len(b)
  181. }
  182. capture := make([][]int, 0, numMatchStartSize)
  183. var offset int
  184. for offset <= n {
  185. match := re.find(b, n, offset)
  186. if match == nil {
  187. break
  188. }
  189. capture = append(capture, match)
  190. // move offset to the ending index of the current match and prepare to
  191. // find the next non-overlapping match.
  192. offset = match[1]
  193. // if match[0] == match[1], it means the current match does not advance
  194. // the search. we need to exit the loop to avoid getting stuck here.
  195. if match[0] == match[1] {
  196. if offset < n && offset >= 0 {
  197. //there are more bytes, so move offset by a word
  198. _, width := utf8.DecodeRune(b[offset:])
  199. offset += width
  200. } else {
  201. //search is over, exit loop
  202. break
  203. }
  204. }
  205. }
  206. return capture
  207. }
  208. func (re *Regexp) FindIndex(b []byte) []int {
  209. match := re.find(b, len(b), 0)
  210. if len(match) == 0 {
  211. return nil
  212. }
  213. return match[:2]
  214. }
  215. func (re *Regexp) Find(b []byte) []byte {
  216. loc := re.FindIndex(b)
  217. if loc == nil {
  218. return nil
  219. }
  220. return getCapture(b, loc[0], loc[1])
  221. }
  222. func (re *Regexp) FindString(s string) string {
  223. mb := re.Find([]byte(s))
  224. if mb == nil {
  225. return ""
  226. }
  227. return string(mb)
  228. }
  229. func (re *Regexp) FindStringIndex(s string) []int {
  230. return re.FindIndex([]byte(s))
  231. }
  232. func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
  233. matches := re.findAll(b, n)
  234. if len(matches) == 0 {
  235. return nil
  236. }
  237. return matches
  238. }
  239. func (re *Regexp) FindAll(b []byte, n int) [][]byte {
  240. matches := re.FindAllIndex(b, n)
  241. if matches == nil {
  242. return nil
  243. }
  244. matchBytes := make([][]byte, 0, len(matches))
  245. for _, match := range matches {
  246. matchBytes = append(matchBytes, getCapture(b, match[0], match[1]))
  247. }
  248. return matchBytes
  249. }
  250. func (re *Regexp) FindAllString(s string, n int) []string {
  251. b := []byte(s)
  252. matches := re.FindAllIndex(b, n)
  253. if matches == nil {
  254. return nil
  255. }
  256. matchStrings := make([]string, 0, len(matches))
  257. for _, match := range matches {
  258. m := getCapture(b, match[0], match[1])
  259. if m == nil {
  260. matchStrings = append(matchStrings, "")
  261. } else {
  262. matchStrings = append(matchStrings, string(m))
  263. }
  264. }
  265. return matchStrings
  266. }
  267. func (re *Regexp) FindAllStringIndex(s string, n int) [][]int {
  268. return re.FindAllIndex([]byte(s), n)
  269. }
  270. func (re *Regexp) FindSubmatchIndex(b []byte) []int {
  271. match := re.find(b, len(b), 0)
  272. if len(match) == 0 {
  273. return nil
  274. }
  275. return match
  276. }
  277. func (re *Regexp) FindSubmatch(b []byte) [][]byte {
  278. match := re.FindSubmatchIndex(b)
  279. if match == nil {
  280. return nil
  281. }
  282. length := len(match) / 2
  283. if length == 0 {
  284. return nil
  285. }
  286. results := make([][]byte, 0, length)
  287. for i := 0; i < length; i++ {
  288. results = append(results, getCapture(b, match[2*i], match[2*i+1]))
  289. }
  290. return results
  291. }
  292. func (re *Regexp) FindStringSubmatch(s string) []string {
  293. b := []byte(s)
  294. match := re.FindSubmatchIndex(b)
  295. if match == nil {
  296. return nil
  297. }
  298. length := len(match) / 2
  299. if length == 0 {
  300. return nil
  301. }
  302. results := make([]string, 0, length)
  303. for i := 0; i < length; i++ {
  304. cap := getCapture(b, match[2*i], match[2*i+1])
  305. if cap == nil {
  306. results = append(results, "")
  307. } else {
  308. results = append(results, string(cap))
  309. }
  310. }
  311. return results
  312. }
  313. func (re *Regexp) FindStringSubmatchIndex(s string) []int {
  314. return re.FindSubmatchIndex([]byte(s))
  315. }
  316. func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
  317. matches := re.findAll(b, n)
  318. if len(matches) == 0 {
  319. return nil
  320. }
  321. return matches
  322. }
  323. func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
  324. matches := re.findAll(b, n)
  325. if len(matches) == 0 {
  326. return nil
  327. }
  328. allCapturedBytes := make([][][]byte, 0, len(matches))
  329. for _, match := range matches {
  330. length := len(match) / 2
  331. capturedBytes := make([][]byte, 0, length)
  332. for i := 0; i < length; i++ {
  333. capturedBytes = append(capturedBytes, getCapture(b, match[2*i], match[2*i+1]))
  334. }
  335. allCapturedBytes = append(allCapturedBytes, capturedBytes)
  336. }
  337. return allCapturedBytes
  338. }
  339. func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
  340. b := []byte(s)
  341. matches := re.findAll(b, n)
  342. if len(matches) == 0 {
  343. return nil
  344. }
  345. allCapturedStrings := make([][]string, 0, len(matches))
  346. for _, match := range matches {
  347. length := len(match) / 2
  348. capturedStrings := make([]string, 0, length)
  349. for i := 0; i < length; i++ {
  350. cap := getCapture(b, match[2*i], match[2*i+1])
  351. if cap == nil {
  352. capturedStrings = append(capturedStrings, "")
  353. } else {
  354. capturedStrings = append(capturedStrings, string(cap))
  355. }
  356. }
  357. allCapturedStrings = append(allCapturedStrings, capturedStrings)
  358. }
  359. return allCapturedStrings
  360. }
  361. func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int {
  362. return re.FindAllSubmatchIndex([]byte(s), n)
  363. }
  364. func (re *Regexp) Match(b []byte) bool {
  365. return re.match(b, len(b), 0)
  366. }
  367. func (re *Regexp) MatchString(s string) bool {
  368. return re.Match([]byte(s))
  369. }
  370. func (re *Regexp) NumSubexp() int {
  371. return (int)(C.onig_number_of_captures(re.regex))
  372. }
  // fillCapturedValues expands a replacement template against one match.
  //
  // In repl, a backslash followed by a digit 1-9 is replaced by
  // capturedBytes["1"].."9"], and \k<name> is replaced by capturedBytes[name];
  // missing keys expand to nothing. Any other escaped byte is copied together
  // with its backslash, and a backslash that ends repl is dropped. The second
  // argument (the matched bytes) is not used.
  func fillCapturedValues(repl []byte, _ []byte, capturedBytes map[string][]byte) []byte {
  	size := len(repl)
  	out := make([]byte, 0, size*3)
  	name := make([]byte, 0, size)
  	var inName, escaped bool

  	for i := 0; i < size; i++ {
  		c := repl[i]
  		switch {
  		case inName && c == '<':
  			// a '<' inside a group name is skipped
  		case inName && c == '>':
  			// end of \k<name>: emit the capture and reset the name buffer
  			inName = false
  			out = append(out, capturedBytes[string(name)]...)
  			name = name[:0]
  		case inName:
  			name = append(name, c)
  		case escaped && c >= '1' && c <= '9':
  			out = append(out, capturedBytes[string(c)]...)
  		case escaped && c == 'k' && i+1 < size && repl[i+1] == '<':
  			inName = true
  			escaped = false
  			i++ // step over the '<'
  		case escaped:
  			out = append(out, '\\', c)
  		case c != '\\':
  			out = append(out, c)
  		}

  		// A backslash opens an escape; any byte seen while escaped closes it.
  		if c == '\\' || escaped {
  			escaped = !escaped
  		}
  	}
  	return out
  }
  408. func (re *Regexp) replaceAll(src, repl []byte, replFunc func([]byte, []byte, map[string][]byte) []byte) []byte {
  409. srcLen := len(src)
  410. matches := re.findAll(src, srcLen)
  411. if len(matches) == 0 {
  412. return src
  413. }
  414. dest := make([]byte, 0, srcLen)
  415. for i, match := range matches {
  416. length := len(match) / 2
  417. capturedBytes := make(map[string][]byte)
  418. if re.namedGroupInfo == nil {
  419. for j := 0; j < length; j++ {
  420. capturedBytes[strconv.Itoa(j)] = getCapture(src, match[2*j], match[2*j+1])
  421. }
  422. } else {
  423. for name, j := range re.namedGroupInfo {
  424. capturedBytes[name] = getCapture(src, match[2*j], match[2*j+1])
  425. }
  426. }
  427. matchBytes := getCapture(src, match[0], match[1])
  428. newRepl := replFunc(repl, matchBytes, capturedBytes)
  429. prevEnd := 0
  430. if i > 0 {
  431. prevMatch := matches[i-1][:2]
  432. prevEnd = prevMatch[1]
  433. }
  434. if match[0] > prevEnd && prevEnd >= 0 && match[0] <= srcLen {
  435. dest = append(dest, src[prevEnd:match[0]]...)
  436. }
  437. dest = append(dest, newRepl...)
  438. }
  439. lastEnd := matches[len(matches)-1][1]
  440. if lastEnd < srcLen && lastEnd >= 0 {
  441. dest = append(dest, src[lastEnd:]...)
  442. }
  443. return dest
  444. }
  445. func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
  446. return re.replaceAll(src, repl, fillCapturedValues)
  447. }
  448. func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
  449. return re.replaceAll(src, nil, func(_ []byte, matchBytes []byte, _ map[string][]byte) []byte {
  450. return repl(matchBytes)
  451. })
  452. }
  453. func (re *Regexp) ReplaceAllString(src, repl string) string {
  454. return string(re.ReplaceAll([]byte(src), []byte(repl)))
  455. }
  456. func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
  457. return string(re.replaceAll([]byte(src), nil, func(_ []byte, matchBytes []byte, _ map[string][]byte) []byte {
  458. return []byte(repl(string(matchBytes)))
  459. }))
  460. }
  461. func (re *Regexp) String() string {
  462. return re.pattern
  463. }
  // growBuffer makes sure n more bytes fit in b after position offset. When
  // offset+n fits within cap(b), b is returned as is; otherwise a new slice of
  // length 2*cap(b)+n is returned with b[:offset] copied into it.
  func growBuffer(b []byte, offset int, n int) []byte {
  	if need := offset + n; need <= cap(b) {
  		return b
  	}
  	grown := make([]byte, 2*cap(b)+n)
  	copy(grown, b[:offset])
  	return grown
  }
  // fromReader drains r and returns everything it produced, encoded as UTF-8.
  //
  // Reading stops at the first error from ReadRune. The function has no error
  // result, so end of input and a read failure both simply end the read.
  //
  // Runes are re-encoded rather than copied, so the width reported by the
  // reader does not have to match the encoded width. In particular an invalid
  // input byte, which readers report as U+FFFD with width 1, is stored as the
  // three-byte encoding of U+FFFD instead of causing a panic.
  func fromReader(r io.RuneReader) []byte {
  	var buf bytes.Buffer
  	for {
  		c, _, err := r.ReadRune()
  		if err != nil {
  			break
  		}
  		// WriteRune's error is documented to always be nil.
  		buf.WriteRune(c)
  	}
  	return buf.Bytes()
  }
  489. func (re *Regexp) FindReaderIndex(r io.RuneReader) []int {
  490. b := fromReader(r)
  491. return re.FindIndex(b)
  492. }
  493. func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int {
  494. b := fromReader(r)
  495. return re.FindSubmatchIndex(b)
  496. }
  497. func (re *Regexp) MatchReader(r io.RuneReader) bool {
  498. b := fromReader(r)
  499. return re.Match(b)
  500. }
  501. func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
  502. //no easy way to implement this
  503. return "", false
  504. }
  505. func MatchString(pattern string, s string) (matched bool, error error) {
  506. re, err := Compile(pattern)
  507. if err != nil {
  508. return false, err
  509. }
  510. return re.MatchString(s), nil
  511. }
  512. func (re *Regexp) Gsub(src, repl string) string {
  513. return string(re.replaceAll([]byte(src), []byte(repl), fillCapturedValues))
  514. }
  515. func (re *Regexp) GsubFunc(src string, replFunc func(string, map[string]string) string) string {
  516. replaced := re.replaceAll([]byte(src), nil,
  517. func(_ []byte, matchBytes []byte, capturedBytes map[string][]byte) []byte {
  518. capturedStrings := make(map[string]string)
  519. for name, capBytes := range capturedBytes {
  520. capturedStrings[name] = string(capBytes)
  521. }
  522. matchString := string(matchBytes)
  523. return ([]byte)(replFunc(matchString, capturedStrings))
  524. },
  525. )
  526. return string(replaced)
  527. }