You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

regexp.go 9.7KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358
  1. /*
  2. Package regexp2 is a regexp package that has an interface similar to Go's framework regexp engine but uses a
  3. more full-featured regex engine behind the scenes.
  4. It doesn't have constant time guarantees, but it allows backtracking and is compatible with Perl5 and .NET.
  5. You'll likely be better off with the RE2 engine from the regexp package and should only use this if you
  6. need to write very complex patterns or require compatibility with .NET.
  7. */
  8. package regexp2
  9. import (
  10. "errors"
  11. "math"
  12. "strconv"
  13. "sync"
  14. "time"
  15. "github.com/dlclark/regexp2/syntax"
  16. )
  17. // Default timeout used when running regexp matches -- "forever"
  18. var DefaultMatchTimeout = time.Duration(math.MaxInt64)
  19. // Regexp is the representation of a compiled regular expression.
  20. // A Regexp is safe for concurrent use by multiple goroutines.
  21. type Regexp struct {
  22. //timeout when trying to find matches
  23. MatchTimeout time.Duration
  24. // read-only after Compile
  25. pattern string // as passed to Compile
  26. options RegexOptions // options
  27. caps map[int]int // capnum->index
  28. capnames map[string]int //capture group name -> index
  29. capslist []string //sorted list of capture group names
  30. capsize int // size of the capture array
  31. code *syntax.Code // compiled program
  32. // cache of machines for running regexp
  33. muRun sync.Mutex
  34. runner []*runner
  35. }
  36. // Compile parses a regular expression and returns, if successful,
  37. // a Regexp object that can be used to match against text.
  38. func Compile(expr string, opt RegexOptions) (*Regexp, error) {
  39. // parse it
  40. tree, err := syntax.Parse(expr, syntax.RegexOptions(opt))
  41. if err != nil {
  42. return nil, err
  43. }
  44. // translate it to code
  45. code, err := syntax.Write(tree)
  46. if err != nil {
  47. return nil, err
  48. }
  49. // return it
  50. return &Regexp{
  51. pattern: expr,
  52. options: opt,
  53. caps: code.Caps,
  54. capnames: tree.Capnames,
  55. capslist: tree.Caplist,
  56. capsize: code.Capsize,
  57. code: code,
  58. MatchTimeout: DefaultMatchTimeout,
  59. }, nil
  60. }
  61. // MustCompile is like Compile but panics if the expression cannot be parsed.
  62. // It simplifies safe initialization of global variables holding compiled regular
  63. // expressions.
  64. func MustCompile(str string, opt RegexOptions) *Regexp {
  65. regexp, error := Compile(str, opt)
  66. if error != nil {
  67. panic(`regexp2: Compile(` + quote(str) + `): ` + error.Error())
  68. }
  69. return regexp
  70. }
  71. // Escape adds backslashes to any special characters in the input string
  72. func Escape(input string) string {
  73. return syntax.Escape(input)
  74. }
  75. // Unescape removes any backslashes from previously-escaped special characters in the input string
  76. func Unescape(input string) (string, error) {
  77. return syntax.Unescape(input)
  78. }
  79. // String returns the source text used to compile the regular expression.
  80. func (re *Regexp) String() string {
  81. return re.pattern
  82. }
  83. func quote(s string) string {
  84. if strconv.CanBackquote(s) {
  85. return "`" + s + "`"
  86. }
  87. return strconv.Quote(s)
  88. }
  89. // RegexOptions impact the runtime and parsing behavior
  90. // for each specific regex. They are setable in code as well
  91. // as in the regex pattern itself.
  92. type RegexOptions int32
  93. const (
  94. None RegexOptions = 0x0
  95. IgnoreCase = 0x0001 // "i"
  96. Multiline = 0x0002 // "m"
  97. ExplicitCapture = 0x0004 // "n"
  98. Compiled = 0x0008 // "c"
  99. Singleline = 0x0010 // "s"
  100. IgnorePatternWhitespace = 0x0020 // "x"
  101. RightToLeft = 0x0040 // "r"
  102. Debug = 0x0080 // "d"
  103. ECMAScript = 0x0100 // "e"
  104. RE2 = 0x0200 // RE2 (regexp package) compatibility mode
  105. )
  106. func (re *Regexp) RightToLeft() bool {
  107. return re.options&RightToLeft != 0
  108. }
  109. func (re *Regexp) Debug() bool {
  110. return re.options&Debug != 0
  111. }
  112. // Replace searches the input string and replaces each match found with the replacement text.
  113. // Count will limit the number of matches attempted and startAt will allow
  114. // us to skip past possible matches at the start of the input (left or right depending on RightToLeft option).
  115. // Set startAt and count to -1 to go through the whole string
  116. func (re *Regexp) Replace(input, replacement string, startAt, count int) (string, error) {
  117. data, err := syntax.NewReplacerData(replacement, re.caps, re.capsize, re.capnames, syntax.RegexOptions(re.options))
  118. if err != nil {
  119. return "", err
  120. }
  121. //TODO: cache ReplacerData
  122. return replace(re, data, nil, input, startAt, count)
  123. }
  124. // ReplaceFunc searches the input string and replaces each match found using the string from the evaluator
  125. // Count will limit the number of matches attempted and startAt will allow
  126. // us to skip past possible matches at the start of the input (left or right depending on RightToLeft option).
  127. // Set startAt and count to -1 to go through the whole string.
  128. func (re *Regexp) ReplaceFunc(input string, evaluator MatchEvaluator, startAt, count int) (string, error) {
  129. return replace(re, nil, evaluator, input, startAt, count)
  130. }
  131. // FindStringMatch searches the input string for a Regexp match
  132. func (re *Regexp) FindStringMatch(s string) (*Match, error) {
  133. // convert string to runes
  134. return re.run(false, -1, getRunes(s))
  135. }
  136. // FindRunesMatch searches the input rune slice for a Regexp match
  137. func (re *Regexp) FindRunesMatch(r []rune) (*Match, error) {
  138. return re.run(false, -1, r)
  139. }
  140. // FindStringMatchStartingAt searches the input string for a Regexp match starting at the startAt index
  141. func (re *Regexp) FindStringMatchStartingAt(s string, startAt int) (*Match, error) {
  142. if startAt > len(s) {
  143. return nil, errors.New("startAt must be less than the length of the input string")
  144. }
  145. r, startAt := re.getRunesAndStart(s, startAt)
  146. if startAt == -1 {
  147. // we didn't find our start index in the string -- that's a problem
  148. return nil, errors.New("startAt must align to the start of a valid rune in the input string")
  149. }
  150. return re.run(false, startAt, r)
  151. }
  152. // FindRunesMatchStartingAt searches the input rune slice for a Regexp match starting at the startAt index
  153. func (re *Regexp) FindRunesMatchStartingAt(r []rune, startAt int) (*Match, error) {
  154. return re.run(false, startAt, r)
  155. }
  156. // FindNextMatch returns the next match in the same input string as the match parameter.
  157. // Will return nil if there is no next match or if given a nil match.
  158. func (re *Regexp) FindNextMatch(m *Match) (*Match, error) {
  159. if m == nil {
  160. return nil, nil
  161. }
  162. // If previous match was empty, advance by one before matching to prevent
  163. // infinite loop
  164. startAt := m.textpos
  165. if m.Length == 0 {
  166. if m.textpos == len(m.text) {
  167. return nil, nil
  168. }
  169. if re.RightToLeft() {
  170. startAt--
  171. } else {
  172. startAt++
  173. }
  174. }
  175. return re.run(false, startAt, m.text)
  176. }
  177. // MatchString return true if the string matches the regex
  178. // error will be set if a timeout occurs
  179. func (re *Regexp) MatchString(s string) (bool, error) {
  180. m, err := re.run(true, -1, getRunes(s))
  181. if err != nil {
  182. return false, err
  183. }
  184. return m != nil, nil
  185. }
  186. func (re *Regexp) getRunesAndStart(s string, startAt int) ([]rune, int) {
  187. if startAt < 0 {
  188. if re.RightToLeft() {
  189. r := getRunes(s)
  190. return r, len(r)
  191. }
  192. return getRunes(s), 0
  193. }
  194. ret := make([]rune, len(s))
  195. i := 0
  196. runeIdx := -1
  197. for strIdx, r := range s {
  198. if strIdx == startAt {
  199. runeIdx = i
  200. }
  201. ret[i] = r
  202. i++
  203. }
  204. return ret[:i], runeIdx
  205. }
  206. func getRunes(s string) []rune {
  207. ret := make([]rune, len(s))
  208. i := 0
  209. for _, r := range s {
  210. ret[i] = r
  211. i++
  212. }
  213. return ret[:i]
  214. }
  215. // MatchRunes return true if the runes matches the regex
  216. // error will be set if a timeout occurs
  217. func (re *Regexp) MatchRunes(r []rune) (bool, error) {
  218. m, err := re.run(true, -1, r)
  219. if err != nil {
  220. return false, err
  221. }
  222. return m != nil, nil
  223. }
  224. // GetGroupNames Returns the set of strings used to name capturing groups in the expression.
  225. func (re *Regexp) GetGroupNames() []string {
  226. var result []string
  227. if re.capslist == nil {
  228. result = make([]string, re.capsize)
  229. for i := 0; i < len(result); i++ {
  230. result[i] = strconv.Itoa(i)
  231. }
  232. } else {
  233. result = make([]string, len(re.capslist))
  234. copy(result, re.capslist)
  235. }
  236. return result
  237. }
  238. // GetGroupNumbers returns the integer group numbers corresponding to a group name.
  239. func (re *Regexp) GetGroupNumbers() []int {
  240. var result []int
  241. if re.caps == nil {
  242. result = make([]int, re.capsize)
  243. for i := 0; i < len(result); i++ {
  244. result[i] = i
  245. }
  246. } else {
  247. result = make([]int, len(re.caps))
  248. for k, v := range re.caps {
  249. result[v] = k
  250. }
  251. }
  252. return result
  253. }
  254. // GroupNameFromNumber retrieves a group name that corresponds to a group number.
  255. // It will return "" for and unknown group number. Unnamed groups automatically
  256. // receive a name that is the decimal string equivalent of its number.
  257. func (re *Regexp) GroupNameFromNumber(i int) string {
  258. if re.capslist == nil {
  259. if i >= 0 && i < re.capsize {
  260. return strconv.Itoa(i)
  261. }
  262. return ""
  263. }
  264. if re.caps != nil {
  265. var ok bool
  266. if i, ok = re.caps[i]; !ok {
  267. return ""
  268. }
  269. }
  270. if i >= 0 && i < len(re.capslist) {
  271. return re.capslist[i]
  272. }
  273. return ""
  274. }
  275. // GroupNumberFromName returns a group number that corresponds to a group name.
  276. // Returns -1 if the name is not a recognized group name. Numbered groups
  277. // automatically get a group name that is the decimal string equivalent of its number.
  278. func (re *Regexp) GroupNumberFromName(name string) int {
  279. // look up name if we have a hashtable of names
  280. if re.capnames != nil {
  281. if k, ok := re.capnames[name]; ok {
  282. return k
  283. }
  284. return -1
  285. }
  286. // convert to an int if it looks like a number
  287. result := 0
  288. for i := 0; i < len(name); i++ {
  289. ch := name[i]
  290. if ch > '9' || ch < '0' {
  291. return -1
  292. }
  293. result *= 10
  294. result += int(ch - '0')
  295. }
  296. // return int if it's in range
  297. if result >= 0 && result < re.capsize {
  298. return result
  299. }
  300. return -1
  301. }