You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

xurls.go 3.6KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107
// Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
// See LICENSE for licensing information

// Package xurls extracts URLs from plain text using regular expressions.
  4. package xurls
  5. import (
  6. "bytes"
  7. "regexp"
  8. )
  9. //go:generate go run generate/tldsgen/main.go
  10. //go:generate go run generate/schemesgen/main.go
  11. const (
  12. letter = `\p{L}`
  13. mark = `\p{M}`
  14. number = `\p{N}`
  15. iriChar = letter + mark + number
  16. currency = `\p{Sc}`
  17. otherSymb = `\p{So}`
  18. endChar = iriChar + `/\-+_&~*%=#` + currency + otherSymb
  19. otherPunc = `\p{Po}`
  20. midChar = endChar + `|` + otherPunc
  21. wellParen = `\([` + midChar + `]*(\([` + midChar + `]*\)[` + midChar + `]*)*\)`
  22. wellBrack = `\[[` + midChar + `]*(\[[` + midChar + `]*\][` + midChar + `]*)*\]`
  23. wellBrace = `\{[` + midChar + `]*(\{[` + midChar + `]*\}[` + midChar + `]*)*\}`
  24. wellAll = wellParen + `|` + wellBrack + `|` + wellBrace
  25. pathCont = `([` + midChar + `]*(` + wellAll + `|[` + endChar + `])+)+`
  26. iri = `[` + iriChar + `]([` + iriChar + `\-]*[` + iriChar + `])?`
  27. domain = `(` + iri + `\.)+`
  28. octet = `(25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])`
  29. ipv4Addr = `\b` + octet + `\.` + octet + `\.` + octet + `\.` + octet + `\b`
  30. ipv6Addr = `([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:[0-9a-fA-F]{0,4}|:[0-9a-fA-F]{1,4})?|(:[0-9a-fA-F]{1,4}){0,2})|(:[0-9a-fA-F]{1,4}){0,3})|(:[0-9a-fA-F]{1,4}){0,4})|:(:[0-9a-fA-F]{1,4}){0,5})((:[0-9a-fA-F]{1,4}){2}|:(25[0-5]|(2[0-4]|1[0-9]|[1-9])?[0-9])(\.(25[0-5]|(2[0-4]|1[0-9]|[1-9])?[0-9])){3})|(([0-9a-fA-F]{1,4}:){1,6}|:):[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){7}:`
  31. ipAddr = `(` + ipv4Addr + `|` + ipv6Addr + `)`
  32. port = `(:[0-9]*)?`
  33. )
  34. // AnyScheme can be passed to StrictMatchingScheme to match any possibly valid
  35. // scheme, and not just the known ones.
  36. var AnyScheme = `([a-zA-Z][a-zA-Z.\-+]*://|` + anyOf(SchemesNoAuthority...) + `:)`
  37. // SchemesNoAuthority is a sorted list of some well-known url schemes that are
  38. // followed by ":" instead of "://".
  39. var SchemesNoAuthority = []string{
  40. `bitcoin`, // Bitcoin
  41. `file`, // Files
  42. `magnet`, // Torrent magnets
  43. `mailto`, // Mail
  44. `sms`, // SMS
  45. `tel`, // Telephone
  46. `xmpp`, // XMPP
  47. }
  48. func anyOf(strs ...string) string {
  49. var b bytes.Buffer
  50. b.WriteByte('(')
  51. for i, s := range strs {
  52. if i != 0 {
  53. b.WriteByte('|')
  54. }
  55. b.WriteString(regexp.QuoteMeta(s))
  56. }
  57. b.WriteByte(')')
  58. return b.String()
  59. }
  60. func strictExp() string {
  61. schemes := `(` + anyOf(Schemes...) + `://|` + anyOf(SchemesNoAuthority...) + `:)`
  62. return `(?i)` + schemes + `(?-i)` + pathCont
  63. }
  64. func relaxedExp() string {
  65. site := domain + `(?i)` + anyOf(append(TLDs, PseudoTLDs...)...) + `(?-i)`
  66. hostName := `(` + site + `|` + ipAddr + `)`
  67. webURL := hostName + port + `(/|/` + pathCont + `?|\b|$)`
  68. return strictExp() + `|` + webURL
  69. }
  70. // Strict produces a regexp that matches any URL with a scheme in either the
  71. // Schemes or SchemesNoAuthority lists.
  72. func Strict() *regexp.Regexp {
  73. re := regexp.MustCompile(strictExp())
  74. re.Longest()
  75. return re
  76. }
  77. // Relaxed produces a regexp that matches any URL matched by Strict, plus any
  78. // URL with no scheme.
  79. func Relaxed() *regexp.Regexp {
  80. re := regexp.MustCompile(relaxedExp())
  81. re.Longest()
  82. return re
  83. }
  84. // StrictMatchingScheme produces a regexp similar to Strict, but requiring that
  85. // the scheme match the given regular expression. See AnyScheme too.
  86. func StrictMatchingScheme(exp string) (*regexp.Regexp, error) {
  87. strictMatching := `(?i)(` + exp + `)(?-i)` + pathCont
  88. re, err := regexp.Compile(strictMatching)
  89. if err != nil {
  90. return nil, err
  91. }
  92. re.Longest()
  93. return re, nil
  94. }