You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

xurls.go 3.6KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109
  1. // Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
  2. // See LICENSE for licensing information
  3. // Package xurls extracts urls from plain text using regular expressions.
  4. package xurls
  5. import (
  6. "bytes"
  7. "regexp"
  8. )
  9. //go:generate go run generate/tldsgen/main.go
  10. //go:generate go run generate/schemesgen/main.go
  11. const (
  12. letter = `\p{L}`
  13. mark = `\p{M}`
  14. number = `\p{N}`
  15. iriChar = letter + mark + number
  16. currency = `\p{Sc}`
  17. otherSymb = `\p{So}`
  18. endChar = iriChar + `/\-+&~%=#` + currency + otherSymb
  19. otherPunc = `\p{Po}`
  20. midChar = endChar + "_*" + otherPunc
  21. wellParen = `\([` + midChar + `]*(\([` + midChar + `]*\)[` + midChar + `]*)*\)`
  22. wellBrack = `\[[` + midChar + `]*(\[[` + midChar + `]*\][` + midChar + `]*)*\]`
  23. wellBrace = `\{[` + midChar + `]*(\{[` + midChar + `]*\}[` + midChar + `]*)*\}`
  24. wellAll = wellParen + `|` + wellBrack + `|` + wellBrace
  25. pathCont = `([` + midChar + `]*(` + wellAll + `|[` + endChar + `])+)+`
  26. iri = `[` + iriChar + `]([` + iriChar + `\-]*[` + iriChar + `])?`
  27. domain = `(` + iri + `\.)+`
  28. octet = `(25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])`
  29. ipv4Addr = `\b` + octet + `\.` + octet + `\.` + octet + `\.` + octet + `\b`
  30. ipv6Addr = `([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:[0-9a-fA-F]{0,4}|:[0-9a-fA-F]{1,4})?|(:[0-9a-fA-F]{1,4}){0,2})|(:[0-9a-fA-F]{1,4}){0,3})|(:[0-9a-fA-F]{1,4}){0,4})|:(:[0-9a-fA-F]{1,4}){0,5})((:[0-9a-fA-F]{1,4}){2}|:(25[0-5]|(2[0-4]|1[0-9]|[1-9])?[0-9])(\.(25[0-5]|(2[0-4]|1[0-9]|[1-9])?[0-9])){3})|(([0-9a-fA-F]{1,4}:){1,6}|:):[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){7}:`
  31. ipAddr = `(` + ipv4Addr + `|` + ipv6Addr + `)`
  32. port = `(:[0-9]*)?`
  33. )
  34. // AnyScheme can be passed to StrictMatchingScheme to match any possibly valid
  35. // scheme, and not just the known ones.
  36. var AnyScheme = `([a-zA-Z][a-zA-Z.\-+]*://|` + anyOf(SchemesNoAuthority...) + `:)`
  37. // SchemesNoAuthority is a sorted list of some well-known url schemes that are
  38. // followed by ":" instead of "://".
  39. var SchemesNoAuthority = []string{
  40. `bitcoin`, // Bitcoin
  41. `file`, // Files
  42. `magnet`, // Torrent magnets
  43. `mailto`, // Mail
  44. `sms`, // SMS
  45. `tel`, // Telephone
  46. `xmpp`, // XMPP
  47. }
  48. func anyOf(strs ...string) string {
  49. var b bytes.Buffer
  50. b.WriteByte('(')
  51. for i, s := range strs {
  52. if i != 0 {
  53. b.WriteByte('|')
  54. }
  55. b.WriteString(regexp.QuoteMeta(s))
  56. }
  57. b.WriteByte(')')
  58. return b.String()
  59. }
  60. func strictExp() string {
  61. schemes := `(` + anyOf(Schemes...) + `://|` + anyOf(SchemesNoAuthority...) + `:)`
  62. return `(?i)` + schemes + `(?-i)` + pathCont
  63. }
  64. func relaxedExp() string {
  65. punycode := `xn--[a-z0-9-]+`
  66. knownTLDs := anyOf(append(TLDs, PseudoTLDs...)...)
  67. site := domain + `(?i)(` + punycode + `|` + knownTLDs + `)(?-i)`
  68. hostName := `(` + site + `|` + ipAddr + `)`
  69. webURL := hostName + port + `(/|/` + pathCont + `?|\b|(?m)$)`
  70. return strictExp() + `|` + webURL
  71. }
  72. // Strict produces a regexp that matches any URL with a scheme in either the
  73. // Schemes or SchemesNoAuthority lists.
  74. func Strict() *regexp.Regexp {
  75. re := regexp.MustCompile(strictExp())
  76. re.Longest()
  77. return re
  78. }
  79. // Relaxed produces a regexp that matches any URL matched by Strict, plus any
  80. // URL with no scheme.
  81. func Relaxed() *regexp.Regexp {
  82. re := regexp.MustCompile(relaxedExp())
  83. re.Longest()
  84. return re
  85. }
  86. // StrictMatchingScheme produces a regexp similar to Strict, but requiring that
  87. // the scheme match the given regular expression. See AnyScheme too.
  88. func StrictMatchingScheme(exp string) (*regexp.Regexp, error) {
  89. strictMatching := `(?i)(` + exp + `)(?-i)` + pathCont
  90. re, err := regexp.Compile(strictMatching)
  91. if err != nil {
  92. return nil, err
  93. }
  94. re.Longest()
  95. return re, nil
  96. }