You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

url.c 60KB


  1. /*
  2. * Copyright (c) 2009-2015, Vsevolod Stakhov
  3. * Copyright (C) 2002-2015 Igor Sysoev
  4. * Copyright (C) 2011-2015 Nginx, Inc.
  5. * All rights reserved.
  6. *
  7. * Redistribution and use in source and binary forms, with or without
  8. * modification, are permitted provided that the following conditions are met:
  9. * * Redistributions of source code must retain the above copyright
  10. * notice, this list of conditions and the following disclaimer.
  11. * * Redistributions in binary form must reproduce the above copyright
  12. * notice, this list of conditions and the following disclaimer in the
  13. * documentation and/or other materials provided with the distribution.
  14. *
  15. * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
  16. * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  17. * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  18. * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
  19. * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  20. * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  21. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  22. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  23. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  24. * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. */
  26. #include "config.h"
  27. #include "url.h"
  28. #include "util.h"
  29. #include "fstring.h"
  30. #include "main.h"
  31. #include "message.h"
  32. #include "trie.h"
  33. #include "http.h"
  /*
   * Match descriptor handed (by pointer) to the start/end callbacks of
   * struct url_matcher.
   * NOTE(review): the code that fills and reads these fields is not visible
   * in this part of the file; the per-field notes are inferred from the
   * names and must be confirmed against the callbacks.
   */
  34. typedef struct url_match_s {
  35. const gchar *m_begin; /* presumably the start of the matched text - TODO confirm */
  36. gsize m_len; /* presumably the length of the match counted from m_begin - TODO confirm */
  37. const gchar *pattern; /* presumably the matcher pattern that triggered the hit - TODO confirm */
  38. const gchar *prefix; /* presumably the matcher prefix (see url_matcher.prefix) - TODO confirm */
  39. gboolean add_prefix; /* presumably set when prefix has to be prepended - TODO confirm */
  40. } url_match_t;
  /*
   * Bit flags combined with '|' in url_matcher.flags (see static_matchers).
   * NOTE(review): the code that tests these bits is not visible here, so the
   * exact effect of each flag must be confirmed against the scanner code.
   */
  41. #define URL_FLAG_NOHTML (1 << 0) /* set on "ftp.", every TLD entry and "@" in static_matchers */
  42. #define URL_FLAG_STRICT_MATCH (1 << 1) /* set on every TLD entry in static_matchers */
  43. #define URL_FLAG_STAR_MATCH (1 << 2) /* not used by any static_matchers entry */
  /*
   * One matcher definition: a text pattern plus the callbacks used with it.
   * static_matchers is an array of these, initialised in field order
   * { pattern, prefix, start, end, flags }.
   */
  44. struct url_matcher {
  45. gchar *pattern; /* literal text of the matcher, e.g. "http://", "www.", ".com" or "@" */
  46. const gchar *prefix; /* "" or a scheme such as "http://"; presumably prepended to the match - TODO confirm */
  47. gboolean (*start)(const gchar *begin, const gchar *end, const gchar *pos,
  48. url_match_t *match); /* start callback; presumably locates the beginning of the URL - TODO confirm */
  49. gboolean (*end)(const gchar *begin, const gchar *end, const gchar *pos,
  50. url_match_t *match); /* end callback; presumably locates the end of the URL - TODO confirm */
  51. gint flags; /* 0 or a bitwise OR of the URL_FLAG_* values */
  52. };
  /*
   * Forward declarations of the matcher callbacks referenced by the
   * static_matchers table. Every function has the signature of the
   * start/end members of struct url_matcher; the definitions are not part
   * of this section of the file.
   */
  /* Callbacks used by the "file://" matcher. */
  53. static gboolean url_file_start (const gchar *begin,
  54. const gchar *end,
  55. const gchar *pos,
  56. url_match_t *match);
  57. static gboolean url_file_end (const gchar *begin,
  58. const gchar *end,
  59. const gchar *pos,
  60. url_match_t *match);
  /* Callbacks used by the scheme matchers ("http://", "ftp://", ...) and by "www." / "ftp.". */
  61. static gboolean url_web_start (const gchar *begin,
  62. const gchar *end,
  63. const gchar *pos,
  64. url_match_t *match);
  65. static gboolean url_web_end (const gchar *begin,
  66. const gchar *end,
  67. const gchar *pos,
  68. url_match_t *match);
  /* Callbacks used by the top-level domain matchers (".com", ".org", ...). */
  69. static gboolean url_tld_start (const gchar *begin,
  70. const gchar *end,
  71. const gchar *pos,
  72. url_match_t *match);
  73. static gboolean url_tld_end (const gchar *begin,
  74. const gchar *end,
  75. const gchar *pos,
  76. url_match_t *match);
  /* Callbacks used by the "mailto:" and "@" matchers. */
  77. static gboolean url_email_start (const gchar *begin,
  78. const gchar *end,
  79. const gchar *pos,
  80. url_match_t *match);
  81. static gboolean url_email_end (const gchar *begin,
  82. const gchar *end,
  83. const gchar *pos,
  84. url_match_t *match);
  /*
   * Built-in matcher table. Each entry is
   *   { pattern, prefix, start callback, end callback, flags }
   * in the field order of struct url_matcher. The entries fall into three
   * groups:
   *   1. explicit scheme / host prefixes ("http://", "mailto:", "www.", ...),
   *   2. top-level domain suffixes, all with prefix "http://" and flags
   *      URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH,
   *   3. a bare "@" for text that looks like an e-mail address.
   * NOTE(review): the code that consumes this table is not visible here;
   * presumably a non-empty prefix is prepended to the matched text - confirm.
   */
  85. struct url_matcher static_matchers[] = {
  86. /* Common prefixes: the scheme is already part of the pattern, so prefix is "" */
  87. { "file://", "", url_file_start, url_file_end,
  88. 0 },
  89. { "ftp://", "", url_web_start, url_web_end,
  90. 0 },
  91. { "sftp://", "", url_web_start, url_web_end,
  92. 0 },
  93. { "http://", "", url_web_start, url_web_end,
  94. 0 },
  95. { "https://", "", url_web_start, url_web_end,
  96. 0 },
  97. { "news://", "", url_web_start, url_web_end,
  98. 0 },
  99. { "nntp://", "", url_web_start, url_web_end,
  100. 0 },
  101. { "telnet://", "", url_web_start, url_web_end,
  102. 0 },
  103. { "webcal://", "", url_web_start, url_web_end,
  104. 0 },
  105. { "mailto:", "", url_email_start, url_email_end,
  106. 0 },
  107. { "callto://", "", url_web_start, url_web_end,
  108. 0 },
  109. { "h323:", "", url_web_start, url_web_end,
  110. 0 },
  111. { "sip:", "", url_web_start, url_web_end,
  112. 0 },
  /* Host-name prefixes without a scheme: the scheme is supplied through prefix */
  113. { "www.", "http://", url_web_start, url_web_end,
  114. 0 },
  115. { "ftp.", "ftp://", url_web_start, url_web_end,
  116. URL_FLAG_NOHTML },
  117. /* Top-level domain suffixes, in alphabetical order */
  118. { ".ac", "http://", url_tld_start, url_tld_end,
  119. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  120. { ".ad", "http://", url_tld_start, url_tld_end,
  121. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  122. { ".ae", "http://", url_tld_start, url_tld_end,
  123. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  124. { ".aero", "http://", url_tld_start, url_tld_end,
  125. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  126. { ".af", "http://", url_tld_start, url_tld_end,
  127. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  128. { ".ag", "http://", url_tld_start, url_tld_end,
  129. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  130. { ".ai", "http://", url_tld_start, url_tld_end,
  131. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  132. { ".al", "http://", url_tld_start, url_tld_end,
  133. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  134. { ".am", "http://", url_tld_start, url_tld_end,
  135. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  136. { ".an", "http://", url_tld_start, url_tld_end,
  137. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  138. { ".ao", "http://", url_tld_start, url_tld_end,
  139. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  140. { ".aq", "http://", url_tld_start, url_tld_end,
  141. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  142. { ".ar", "http://", url_tld_start, url_tld_end,
  143. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  144. { ".arpa", "http://", url_tld_start, url_tld_end,
  145. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  146. { ".as", "http://", url_tld_start, url_tld_end,
  147. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  148. { ".asia", "http://", url_tld_start, url_tld_end,
  149. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  150. { ".at", "http://", url_tld_start, url_tld_end,
  151. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  152. { ".au", "http://", url_tld_start, url_tld_end,
  153. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  154. { ".aw", "http://", url_tld_start, url_tld_end,
  155. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  156. { ".ax", "http://", url_tld_start, url_tld_end,
  157. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  158. { ".az", "http://", url_tld_start, url_tld_end,
  159. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  160. { ".ba", "http://", url_tld_start, url_tld_end,
  161. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  162. { ".bb", "http://", url_tld_start, url_tld_end,
  163. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  164. { ".bd", "http://", url_tld_start, url_tld_end,
  165. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  166. { ".be", "http://", url_tld_start, url_tld_end,
  167. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  168. { ".bf", "http://", url_tld_start, url_tld_end,
  169. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  170. { ".bg", "http://", url_tld_start, url_tld_end,
  171. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  172. { ".bh", "http://", url_tld_start, url_tld_end,
  173. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  174. { ".bi", "http://", url_tld_start, url_tld_end,
  175. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  176. { ".biz", "http://", url_tld_start, url_tld_end,
  177. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  178. { ".bj", "http://", url_tld_start, url_tld_end,
  179. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  180. { ".bm", "http://", url_tld_start, url_tld_end,
  181. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  182. { ".bn", "http://", url_tld_start, url_tld_end,
  183. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  184. { ".bo", "http://", url_tld_start, url_tld_end,
  185. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  186. { ".br", "http://", url_tld_start, url_tld_end,
  187. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  188. { ".bs", "http://", url_tld_start, url_tld_end,
  189. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  190. { ".bt", "http://", url_tld_start, url_tld_end,
  191. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  192. { ".bv", "http://", url_tld_start, url_tld_end,
  193. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  194. { ".bw", "http://", url_tld_start, url_tld_end,
  195. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  196. { ".by", "http://", url_tld_start, url_tld_end,
  197. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  198. { ".bz", "http://", url_tld_start, url_tld_end,
  199. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  200. { ".ca", "http://", url_tld_start, url_tld_end,
  201. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  202. { ".cat", "http://", url_tld_start, url_tld_end,
  203. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  204. { ".cc", "http://", url_tld_start, url_tld_end,
  205. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  206. { ".cd", "http://", url_tld_start, url_tld_end,
  207. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  208. { ".cf", "http://", url_tld_start, url_tld_end,
  209. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  210. { ".cg", "http://", url_tld_start, url_tld_end,
  211. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  212. { ".ch", "http://", url_tld_start, url_tld_end,
  213. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  214. { ".ci", "http://", url_tld_start, url_tld_end,
  215. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  216. { ".ck", "http://", url_tld_start, url_tld_end,
  217. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  218. { ".cl", "http://", url_tld_start, url_tld_end,
  219. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  220. { ".cm", "http://", url_tld_start, url_tld_end,
  221. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  222. { ".cn", "http://", url_tld_start, url_tld_end,
  223. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  224. { ".co", "http://", url_tld_start, url_tld_end,
  225. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  226. { ".com", "http://", url_tld_start, url_tld_end,
  227. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  228. { ".coop", "http://", url_tld_start, url_tld_end,
  229. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  230. { ".cr", "http://", url_tld_start, url_tld_end,
  231. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  232. { ".cu", "http://", url_tld_start, url_tld_end,
  233. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  234. { ".cv", "http://", url_tld_start, url_tld_end,
  235. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  236. { ".cw", "http://", url_tld_start, url_tld_end,
  237. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  238. { ".cx", "http://", url_tld_start, url_tld_end,
  239. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  240. { ".cy", "http://", url_tld_start, url_tld_end,
  241. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  242. { ".cz", "http://", url_tld_start, url_tld_end,
  243. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  244. { ".de", "http://", url_tld_start, url_tld_end,
  245. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  246. { ".dj", "http://", url_tld_start, url_tld_end,
  247. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  248. { ".dk", "http://", url_tld_start, url_tld_end,
  249. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  250. { ".dm", "http://", url_tld_start, url_tld_end,
  251. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  252. { ".do", "http://", url_tld_start, url_tld_end,
  253. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  254. { ".dz", "http://", url_tld_start, url_tld_end,
  255. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  256. { ".ec", "http://", url_tld_start, url_tld_end,
  257. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  258. { ".edu", "http://", url_tld_start, url_tld_end,
  259. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  260. { ".ee", "http://", url_tld_start, url_tld_end,
  261. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  262. { ".eg", "http://", url_tld_start, url_tld_end,
  263. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  264. { ".er", "http://", url_tld_start, url_tld_end,
  265. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  266. { ".es", "http://", url_tld_start, url_tld_end,
  267. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  268. { ".et", "http://", url_tld_start, url_tld_end,
  269. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  270. { ".eu", "http://", url_tld_start, url_tld_end,
  271. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  272. { ".fi", "http://", url_tld_start, url_tld_end,
  273. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  274. { ".fj", "http://", url_tld_start, url_tld_end,
  275. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  276. { ".fk", "http://", url_tld_start, url_tld_end,
  277. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  278. { ".fm", "http://", url_tld_start, url_tld_end,
  279. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  280. { ".fo", "http://", url_tld_start, url_tld_end,
  281. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  282. { ".fr", "http://", url_tld_start, url_tld_end,
  283. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  284. { ".ga", "http://", url_tld_start, url_tld_end,
  285. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  286. { ".gb", "http://", url_tld_start, url_tld_end,
  287. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  288. { ".gd", "http://", url_tld_start, url_tld_end,
  289. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  290. { ".ge", "http://", url_tld_start, url_tld_end,
  291. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  292. { ".gf", "http://", url_tld_start, url_tld_end,
  293. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  294. { ".gg", "http://", url_tld_start, url_tld_end,
  295. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  296. { ".gh", "http://", url_tld_start, url_tld_end,
  297. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  298. { ".gi", "http://", url_tld_start, url_tld_end,
  299. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  300. { ".gl", "http://", url_tld_start, url_tld_end,
  301. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  302. { ".gm", "http://", url_tld_start, url_tld_end,
  303. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  304. { ".gn", "http://", url_tld_start, url_tld_end,
  305. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  306. { ".gov", "http://", url_tld_start, url_tld_end,
  307. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  308. { ".gp", "http://", url_tld_start, url_tld_end,
  309. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  310. { ".gq", "http://", url_tld_start, url_tld_end,
  311. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  312. { ".gr", "http://", url_tld_start, url_tld_end,
  313. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  314. { ".gs", "http://", url_tld_start, url_tld_end,
  315. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  316. { ".gt", "http://", url_tld_start, url_tld_end,
  317. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  318. { ".gu", "http://", url_tld_start, url_tld_end,
  319. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  320. { ".gw", "http://", url_tld_start, url_tld_end,
  321. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  322. { ".gy", "http://", url_tld_start, url_tld_end,
  323. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  324. { ".hk", "http://", url_tld_start, url_tld_end,
  325. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  326. { ".hm", "http://", url_tld_start, url_tld_end,
  327. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  328. { ".hn", "http://", url_tld_start, url_tld_end,
  329. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  330. { ".hr", "http://", url_tld_start, url_tld_end,
  331. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  332. { ".ht", "http://", url_tld_start, url_tld_end,
  333. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  334. { ".hu", "http://", url_tld_start, url_tld_end,
  335. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  336. { ".id", "http://", url_tld_start, url_tld_end,
  337. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  338. { ".ie", "http://", url_tld_start, url_tld_end,
  339. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  340. { ".il", "http://", url_tld_start, url_tld_end,
  341. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  342. { ".im", "http://", url_tld_start, url_tld_end,
  343. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  344. { ".in", "http://", url_tld_start, url_tld_end,
  345. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  346. { ".info", "http://", url_tld_start, url_tld_end,
  347. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  348. { ".int", "http://", url_tld_start, url_tld_end,
  349. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  350. { ".io", "http://", url_tld_start, url_tld_end,
  351. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  352. { ".iq", "http://", url_tld_start, url_tld_end,
  353. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  354. { ".ir", "http://", url_tld_start, url_tld_end,
  355. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  356. { ".is", "http://", url_tld_start, url_tld_end,
  357. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  358. { ".it", "http://", url_tld_start, url_tld_end,
  359. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  360. { ".je", "http://", url_tld_start, url_tld_end,
  361. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  362. { ".jm", "http://", url_tld_start, url_tld_end,
  363. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  364. { ".jo", "http://", url_tld_start, url_tld_end,
  365. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  366. { ".jobs", "http://", url_tld_start, url_tld_end,
  367. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  368. { ".jp", "http://", url_tld_start, url_tld_end,
  369. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  370. { ".ke", "http://", url_tld_start, url_tld_end,
  371. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  372. { ".kg", "http://", url_tld_start, url_tld_end,
  373. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  374. { ".kh", "http://", url_tld_start, url_tld_end,
  375. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  376. { ".ki", "http://", url_tld_start, url_tld_end,
  377. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  378. { ".km", "http://", url_tld_start, url_tld_end,
  379. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  380. { ".kn", "http://", url_tld_start, url_tld_end,
  381. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  382. { ".kp", "http://", url_tld_start, url_tld_end,
  383. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  384. { ".kr", "http://", url_tld_start, url_tld_end,
  385. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  386. { ".kw", "http://", url_tld_start, url_tld_end,
  387. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  388. { ".ky", "http://", url_tld_start, url_tld_end,
  389. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  390. { ".kz", "http://", url_tld_start, url_tld_end,
  391. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  392. { ".la", "http://", url_tld_start, url_tld_end,
  393. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  394. { ".lb", "http://", url_tld_start, url_tld_end,
  395. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  396. { ".lc", "http://", url_tld_start, url_tld_end,
  397. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  398. { ".li", "http://", url_tld_start, url_tld_end,
  399. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  400. { ".lk", "http://", url_tld_start, url_tld_end,
  401. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  402. { ".lr", "http://", url_tld_start, url_tld_end,
  403. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  404. { ".ls", "http://", url_tld_start, url_tld_end,
  405. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  406. { ".lt", "http://", url_tld_start, url_tld_end,
  407. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  408. { ".lu", "http://", url_tld_start, url_tld_end,
  409. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  410. { ".lv", "http://", url_tld_start, url_tld_end,
  411. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  412. { ".ly", "http://", url_tld_start, url_tld_end,
  413. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  414. { ".ma", "http://", url_tld_start, url_tld_end,
  415. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  416. { ".mc", "http://", url_tld_start, url_tld_end,
  417. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  418. { ".md", "http://", url_tld_start, url_tld_end,
  419. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  420. { ".me", "http://", url_tld_start, url_tld_end,
  421. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  422. { ".mg", "http://", url_tld_start, url_tld_end,
  423. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  424. { ".mh", "http://", url_tld_start, url_tld_end,
  425. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  426. { ".mil", "http://", url_tld_start, url_tld_end,
  427. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  428. { ".mk", "http://", url_tld_start, url_tld_end,
  429. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  430. { ".ml", "http://", url_tld_start, url_tld_end,
  431. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  432. { ".mm", "http://", url_tld_start, url_tld_end,
  433. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  434. { ".mn", "http://", url_tld_start, url_tld_end,
  435. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  436. { ".mo", "http://", url_tld_start, url_tld_end,
  437. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  438. { ".mobi", "http://", url_tld_start, url_tld_end,
  439. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  440. { ".mp", "http://", url_tld_start, url_tld_end,
  441. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  442. { ".mq", "http://", url_tld_start, url_tld_end,
  443. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  444. { ".mr", "http://", url_tld_start, url_tld_end,
  445. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  446. { ".ms", "http://", url_tld_start, url_tld_end,
  447. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  448. { ".mt", "http://", url_tld_start, url_tld_end,
  449. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  450. { ".mu", "http://", url_tld_start, url_tld_end,
  451. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  452. { ".museum", "http://", url_tld_start, url_tld_end,
  453. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  454. { ".mv", "http://", url_tld_start, url_tld_end,
  455. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  456. { ".mw", "http://", url_tld_start, url_tld_end,
  457. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  458. { ".mx", "http://", url_tld_start, url_tld_end,
  459. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  460. { ".my", "http://", url_tld_start, url_tld_end,
  461. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  462. { ".mz", "http://", url_tld_start, url_tld_end,
  463. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  464. { ".na", "http://", url_tld_start, url_tld_end,
  465. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  466. { ".name", "http://", url_tld_start, url_tld_end,
  467. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  468. { ".nc", "http://", url_tld_start, url_tld_end,
  469. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  470. { ".ne", "http://", url_tld_start, url_tld_end,
  471. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  472. { ".net", "http://", url_tld_start, url_tld_end,
  473. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  474. { ".nf", "http://", url_tld_start, url_tld_end,
  475. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  476. { ".ng", "http://", url_tld_start, url_tld_end,
  477. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  478. { ".ni", "http://", url_tld_start, url_tld_end,
  479. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  480. { ".nl", "http://", url_tld_start, url_tld_end,
  481. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  482. { ".no", "http://", url_tld_start, url_tld_end,
  483. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  484. { ".np", "http://", url_tld_start, url_tld_end,
  485. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  486. { ".nr", "http://", url_tld_start, url_tld_end,
  487. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  488. { ".nu", "http://", url_tld_start, url_tld_end,
  489. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  490. { ".nz", "http://", url_tld_start, url_tld_end,
  491. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  492. { ".om", "http://", url_tld_start, url_tld_end,
  493. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  494. { ".org", "http://", url_tld_start, url_tld_end,
  495. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  496. { ".pa", "http://", url_tld_start, url_tld_end,
  497. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  498. { ".pe", "http://", url_tld_start, url_tld_end,
  499. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  500. { ".pf", "http://", url_tld_start, url_tld_end,
  501. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  502. { ".pg", "http://", url_tld_start, url_tld_end,
  503. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  504. { ".ph", "http://", url_tld_start, url_tld_end,
  505. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  506. { ".pk", "http://", url_tld_start, url_tld_end,
  507. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  508. { ".pl", "http://", url_tld_start, url_tld_end,
  509. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  510. { ".pm", "http://", url_tld_start, url_tld_end,
  511. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  512. { ".pn", "http://", url_tld_start, url_tld_end,
  513. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  514. { ".pr", "http://", url_tld_start, url_tld_end,
  515. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  516. { ".pro", "http://", url_tld_start, url_tld_end,
  517. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  518. { ".ps", "http://", url_tld_start, url_tld_end,
  519. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  520. { ".pt", "http://", url_tld_start, url_tld_end,
  521. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  522. { ".pw", "http://", url_tld_start, url_tld_end,
  523. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  524. { ".py", "http://", url_tld_start, url_tld_end,
  525. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  526. { ".qa", "http://", url_tld_start, url_tld_end,
  527. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  528. { ".re", "http://", url_tld_start, url_tld_end,
  529. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  530. { ".ro", "http://", url_tld_start, url_tld_end,
  531. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  532. { ".rs", "http://", url_tld_start, url_tld_end,
  533. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  534. { ".ru", "http://", url_tld_start, url_tld_end,
  535. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  536. { ".rw", "http://", url_tld_start, url_tld_end,
  537. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  538. { ".sa", "http://", url_tld_start, url_tld_end,
  539. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  540. { ".sb", "http://", url_tld_start, url_tld_end,
  541. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  542. { ".sc", "http://", url_tld_start, url_tld_end,
  543. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  544. { ".sd", "http://", url_tld_start, url_tld_end,
  545. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  546. { ".se", "http://", url_tld_start, url_tld_end,
  547. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  548. { ".sg", "http://", url_tld_start, url_tld_end,
  549. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  550. { ".sh", "http://", url_tld_start, url_tld_end,
  551. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  552. { ".si", "http://", url_tld_start, url_tld_end,
  553. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  554. { ".sj", "http://", url_tld_start, url_tld_end,
  555. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  556. { ".sk", "http://", url_tld_start, url_tld_end,
  557. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  558. { ".sl", "http://", url_tld_start, url_tld_end,
  559. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  560. { ".sm", "http://", url_tld_start, url_tld_end,
  561. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  562. { ".sn", "http://", url_tld_start, url_tld_end,
  563. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  564. { ".so", "http://", url_tld_start, url_tld_end,
  565. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  566. { ".sr", "http://", url_tld_start, url_tld_end,
  567. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  568. { ".st", "http://", url_tld_start, url_tld_end,
  569. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  570. { ".su", "http://", url_tld_start, url_tld_end,
  571. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  572. { ".sv", "http://", url_tld_start, url_tld_end,
  573. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  574. { ".sx", "http://", url_tld_start, url_tld_end,
  575. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  576. { ".sy", "http://", url_tld_start, url_tld_end,
  577. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  578. { ".sz", "http://", url_tld_start, url_tld_end,
  579. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  580. { ".tc", "http://", url_tld_start, url_tld_end,
  581. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  582. { ".td", "http://", url_tld_start, url_tld_end,
  583. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  584. { ".tel", "http://", url_tld_start, url_tld_end,
  585. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  586. { ".tf", "http://", url_tld_start, url_tld_end,
  587. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  588. { ".tg", "http://", url_tld_start, url_tld_end,
  589. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  590. { ".th", "http://", url_tld_start, url_tld_end,
  591. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  592. { ".tj", "http://", url_tld_start, url_tld_end,
  593. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  594. { ".tk", "http://", url_tld_start, url_tld_end,
  595. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  596. { ".tl", "http://", url_tld_start, url_tld_end,
  597. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  598. { ".tm", "http://", url_tld_start, url_tld_end,
  599. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  600. { ".tn", "http://", url_tld_start, url_tld_end,
  601. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  602. { ".to", "http://", url_tld_start, url_tld_end,
  603. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  604. { ".tp", "http://", url_tld_start, url_tld_end,
  605. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  606. { ".tr", "http://", url_tld_start, url_tld_end,
  607. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  608. { ".travel", "http://", url_tld_start, url_tld_end,
  609. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  610. { ".tt", "http://", url_tld_start, url_tld_end,
  611. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  612. { ".tv", "http://", url_tld_start, url_tld_end,
  613. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  614. { ".tw", "http://", url_tld_start, url_tld_end,
  615. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  616. { ".tz", "http://", url_tld_start, url_tld_end,
  617. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  618. { ".ua", "http://", url_tld_start, url_tld_end,
  619. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  620. { ".ug", "http://", url_tld_start, url_tld_end,
  621. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  622. { ".uk", "http://", url_tld_start, url_tld_end,
  623. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  624. { ".us", "http://", url_tld_start, url_tld_end,
  625. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  626. { ".uy", "http://", url_tld_start, url_tld_end,
  627. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  628. { ".uz", "http://", url_tld_start, url_tld_end,
  629. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  630. { ".va", "http://", url_tld_start, url_tld_end,
  631. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  632. { ".vc", "http://", url_tld_start, url_tld_end,
  633. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  634. { ".ve", "http://", url_tld_start, url_tld_end,
  635. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  636. { ".vg", "http://", url_tld_start, url_tld_end,
  637. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  638. { ".vi", "http://", url_tld_start, url_tld_end,
  639. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  640. { ".vn", "http://", url_tld_start, url_tld_end,
  641. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  642. { ".vu", "http://", url_tld_start, url_tld_end,
  643. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  644. { ".wf", "http://", url_tld_start, url_tld_end,
  645. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  646. { ".ws", "http://", url_tld_start, url_tld_end,
  647. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  648. { ".xxx", "http://", url_tld_start, url_tld_end,
  649. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  650. { ".ye", "http://", url_tld_start, url_tld_end,
  651. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  652. { ".yt", "http://", url_tld_start, url_tld_end,
  653. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  654. { ".za", "http://", url_tld_start, url_tld_end,
  655. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  656. { ".zm", "http://", url_tld_start, url_tld_end,
  657. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  658. { ".zw", "http://", url_tld_start, url_tld_end,
  659. URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH },
  660. /* Likely emails: a bare '@' with no scheme in the text */
  /*
   * NOTE(review): the prefix here is "mailto://" whereas the explicit scheme
   * matcher above uses the pattern "mailto:" (no slashes) - confirm that the
   * URL parser accepts both forms.
   */
  661. { "@", "mailto://",url_email_start, url_email_end,
  662. URL_FLAG_NOHTML }
  663. };
  664. struct url_match_scanner {
  665. GArray *matchers;
  666. rspamd_trie_t *search_trie;
  667. rspamd_trie_t *tld_trie;
  668. };
  669. struct url_match_scanner *url_scanner = NULL;
  670. static guchar url_scanner_table[256] = {
  671. 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 1, 1, 9, 1, 1,
  672. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  673. 24,128,160,128,128,128,128,128,160,160,128,128,160,192,160,160,
  674. 68, 68, 68, 68, 68, 68, 68, 68, 68, 68,160,160, 32,128, 32,128,
  675. 160, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,
  676. 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,160,160,160,128,192,
  677. 128, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,
  678. 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,128,128,128,128, 1,
  679. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  680. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  681. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  682. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  683. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  684. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  685. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  686. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
  687. };
  688. enum {
  689. IS_CTRL = (1 << 0),
  690. IS_ALPHA = (1 << 1),
  691. IS_DIGIT = (1 << 2),
  692. IS_LWSP = (1 << 3),
  693. IS_SPACE = (1 << 4),
  694. IS_SPECIAL = (1 << 5),
  695. IS_DOMAIN = (1 << 6),
  696. IS_URLSAFE = (1 << 7)
  697. };
  698. #define is_ctrl(x) ((url_scanner_table[(guchar)(x)] & IS_CTRL) != 0)
  699. #define is_lwsp(x) ((url_scanner_table[(guchar)(x)] & IS_LWSP) != 0)
  700. #define is_atom(x) ((url_scanner_table[(guchar)(x)] & (IS_SPECIAL | IS_SPACE | \
  701. IS_CTRL)) == 0)
  702. #define is_usersafe(x) ((url_scanner_table[(guchar)(x)] & (IS_CTRL | IS_SPACE)) == 0)
  703. #define is_alpha(x) ((url_scanner_table[(guchar)(x)] & IS_ALPHA) != 0)
  704. #define is_digit(x) ((url_scanner_table[(guchar)(x)] & IS_DIGIT) != 0)
  705. #define is_domain(x) ((url_scanner_table[(guchar)(x)] & IS_DOMAIN) != 0)
  706. #define is_urlsafe(x) ((url_scanner_table[(guchar)(x)] & (IS_ALPHA | IS_DIGIT | \
  707. IS_URLSAFE)) != 0)
  708. void
  709. rspamd_unescape_uri (gchar *dst, const gchar *src, gsize size)
  710. {
  711. gchar *d, ch, c, decoded;
  712. const gchar *s;
  713. enum {
  714. sw_usual = 0,
  715. sw_quoted,
  716. sw_quoted_second
  717. } state;
  718. d = dst;
  719. s = src;
  720. state = 0;
  721. decoded = 0;
  722. while (size--) {
  723. ch = *s++;
  724. switch (state) {
  725. case sw_usual:
  726. if (ch == '%') {
  727. state = sw_quoted;
  728. break;
  729. }
  730. *d++ = ch;
  731. break;
  732. case sw_quoted:
  733. if (ch >= '0' && ch <= '9') {
  734. decoded = (ch - '0');
  735. state = sw_quoted_second;
  736. break;
  737. }
  738. c = (ch | 0x20);
  739. if (c >= 'a' && c <= 'f') {
  740. decoded = (c - 'a' + 10);
  741. state = sw_quoted_second;
  742. break;
  743. }
  744. /* the invalid quoted character */
  745. state = sw_usual;
  746. *d++ = ch;
  747. break;
  748. case sw_quoted_second:
  749. state = sw_usual;
  750. if (ch >= '0' && ch <= '9') {
  751. ch = ((decoded << 4) + ch - '0');
  752. *d++ = ch;
  753. break;
  754. }
  755. c = (u_char) (ch | 0x20);
  756. if (c >= 'a' && c <= 'f') {
  757. ch = ((decoded << 4) + c - 'a' + 10);
  758. *d++ = ch;
  759. break;
  760. }
  761. /* the invalid quoted character */
  762. break;
  763. }
  764. }
  765. *d = '\0';
  766. }
  767. const gchar *
  768. rspamd_url_strerror (enum uri_errno err)
  769. {
  770. switch (err) {
  771. case URI_ERRNO_OK:
  772. return "Parsing went well";
  773. case URI_ERRNO_EMPTY:
  774. return "The URI string was empty";
  775. case URI_ERRNO_INVALID_PROTOCOL:
  776. return "No protocol was found";
  777. case URI_ERRNO_BAD_FORMAT:
  778. return "Bad URL format";
  779. case URI_ERRNO_BAD_ENCODING:
  780. return "Invalid symbols encoded";
  781. case URI_ERRNO_INVALID_PORT:
  782. return "Port number is bad";
  783. }
  784. return NULL;
  785. }
  786. static void
  787. rspamd_url_parse_tld_file (const gchar *fname, struct url_match_scanner *scanner)
  788. {
  789. FILE *f;
  790. struct url_matcher m;
  791. gchar *linebuf = NULL, *p;
  792. gsize buflen = 0, patlen;
  793. gssize r;
  794. gint flags;
  795. f = fopen (fname, "r");
  796. if (f == NULL) {
  797. msg_err ("cannot open TLD file %s: %s", fname, strerror (errno));
  798. return;
  799. }
  800. m.end = url_tld_end;
  801. m.start = url_tld_start;
  802. m.prefix = "http://";
  803. while ((r = getline (&linebuf, &buflen, f)) > 0) {
  804. if (linebuf[0] == '/' || g_ascii_isspace (linebuf[0])) {
  805. /* Skip comment or empty line */
  806. continue;
  807. }
  808. g_strchomp (linebuf);
  809. /* TODO: add support for ! patterns */
  810. if (linebuf[0] == '!') {
  811. msg_debug ("skip '!' patterns from parsing for now: %s", linebuf);
  812. continue;
  813. }
  814. flags = URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH;
  815. if (linebuf[0] == '*') {
  816. flags |= URL_FLAG_STAR_MATCH;
  817. p = strchr (linebuf, '.');
  818. if (p == NULL) {
  819. msg_err ("got bad star line, skip it: %s", linebuf);
  820. continue;
  821. }
  822. p ++;
  823. }
  824. else {
  825. p = linebuf;
  826. }
  827. patlen = strlen (p);
  828. m.pattern = g_malloc (patlen + 2);
  829. m.pattern[0] = '.';
  830. rspamd_strlcpy (&m.pattern[1], p, patlen + 1);
  831. g_array_append_val (url_scanner->matchers, m);
  832. }
  833. free (linebuf);
  834. fclose (f);
  835. }
  836. static void
  837. rspamd_url_add_static_matchers (GArray *matchers)
  838. {
  839. gint n = G_N_ELEMENTS (static_matchers);
  840. g_array_append_vals (matchers, static_matchers, n);
  841. }
  842. void
  843. rspamd_url_init (const gchar *tld_file)
  844. {
  845. guint i;
  846. gchar patbuf[128];
  847. struct url_matcher *m;
  848. if (url_scanner == NULL) {
  849. url_scanner = g_malloc (sizeof (struct url_match_scanner));
  850. url_scanner->matchers = g_array_new (FALSE, TRUE,
  851. sizeof (struct url_matcher));
  852. url_scanner->search_trie = rspamd_trie_create (TRUE);
  853. url_scanner->tld_trie = rspamd_trie_create (TRUE);
  854. rspamd_url_add_static_matchers (url_scanner->matchers);
  855. if (tld_file != NULL) {
  856. rspamd_url_parse_tld_file (tld_file, url_scanner);
  857. }
  858. else {
  859. msg_warn ("tld extension file is not specified, url matching is limited");
  860. }
  861. for (i = 0; i < url_scanner->matchers->len; i++) {
  862. m = &g_array_index (url_scanner->matchers, struct url_matcher, i);
  863. rspamd_trie_insert (url_scanner->search_trie, m->pattern, i);
  864. /* Also use it for TLD lookups */
  865. if (strcmp (m->prefix, "http://") == 0) {
  866. rspamd_trie_insert (url_scanner->tld_trie, m->pattern, i);
  867. }
  868. }
  869. }
  870. }
  871. #define SET_U(u, field) do { \
  872. if ((u) != NULL) { \
  873. (u)->field_set |= 1 << (field); \
  874. (u)->field_data[(field)].len = p - c; \
  875. (u)->field_data[(field)].off = c - str; \
  876. } \
  877. } while (0)
  878. static gint
  879. rspamd_mailto_parse (struct http_parser_url *u, const gchar *str, gsize len,
  880. gchar const **end)
  881. {
  882. const gchar *p = str, *c = str, *last = str + len;
  883. gchar t;
  884. gint ret = 1;
  885. enum {
  886. parse_mailto,
  887. parse_slash,
  888. parse_slash_slash,
  889. parse_semicolon,
  890. parse_prefix_question,
  891. parse_destination,
  892. parse_equal,
  893. parse_user,
  894. parse_at,
  895. parse_domain,
  896. parse_suffix_question,
  897. parse_query
  898. } st = parse_mailto;
  899. while (p < last) {
  900. t = *p;
  901. switch (st) {
  902. case parse_mailto:
  903. if (t == ':') {
  904. st = parse_semicolon;
  905. SET_U (u, UF_SCHEMA);
  906. }
  907. p ++;
  908. break;
  909. case parse_semicolon:
  910. if (t == '/') {
  911. st = parse_slash;
  912. p ++;
  913. }
  914. else {
  915. st = parse_slash_slash;
  916. }
  917. break;
  918. case parse_slash:
  919. if (t == '/') {
  920. st = parse_slash_slash;
  921. }
  922. else {
  923. goto out;
  924. }
  925. p ++;
  926. break;
  927. case parse_slash_slash:
  928. if (t == '?') {
  929. st = parse_prefix_question;
  930. p ++;
  931. }
  932. else {
  933. c = p;
  934. st = parse_user;
  935. }
  936. break;
  937. case parse_prefix_question:
  938. if (t == 't') {
  939. /* XXX: accept only to= */
  940. st = parse_destination;
  941. }
  942. else {
  943. goto out;
  944. }
  945. break;
  946. case parse_destination:
  947. if (t == '=') {
  948. st = parse_equal;
  949. }
  950. p ++;
  951. break;
  952. case parse_equal:
  953. c = p;
  954. st = parse_user;
  955. break;
  956. case parse_user:
  957. if (t == '@') {
  958. if (p - c == 0) {
  959. goto out;
  960. }
  961. SET_U (u, UF_USERINFO);
  962. st = parse_at;
  963. }
  964. else if (!is_usersafe (t)) {
  965. goto out;
  966. }
  967. p ++;
  968. break;
  969. case parse_at:
  970. c = p;
  971. st = parse_domain;
  972. break;
  973. case parse_domain:
  974. if (t == '?') {
  975. SET_U (u, UF_HOST);
  976. st = parse_suffix_question;
  977. }
  978. else if (!is_domain (t) && t != '.' && t != '_') {
  979. goto out;
  980. }
  981. p ++;
  982. break;
  983. case parse_suffix_question:
  984. c = p;
  985. st = parse_query;
  986. break;
  987. case parse_query:
  988. if (!is_atom (t)) {
  989. goto out;
  990. }
  991. p ++;
  992. break;
  993. }
  994. }
  995. if (st == parse_domain) {
  996. if (p - c != 0) {
  997. SET_U (u, UF_HOST);
  998. ret = 0;
  999. }
  1000. }
  1001. else if (st == parse_query) {
  1002. if (p - c > 0) {
  1003. SET_U (u, UF_QUERY);
  1004. }
  1005. ret = 0;
  1006. }
  1007. out:
  1008. if (end != NULL) {
  1009. *end = p;
  1010. }
  1011. return ret;
  1012. }
  1013. static gint
  1014. rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
  1015. gchar const **end, gboolean strict)
  1016. {
  1017. const gchar *p = str, *c = str, *last = str + len, *slash = NULL;
  1018. gchar t;
  1019. gunichar uc;
  1020. glong pt;
  1021. gint ret = 1;
  1022. gboolean user_seen = FALSE;
  1023. enum {
  1024. parse_protocol,
  1025. parse_slash,
  1026. parse_slash_slash,
  1027. parse_semicolon,
  1028. parse_user,
  1029. parse_at,
  1030. parse_password_start,
  1031. parse_password,
  1032. parse_domain,
  1033. parse_port_password,
  1034. parse_port,
  1035. parse_suffix_slash,
  1036. parse_path,
  1037. parse_query,
  1038. parse_part
  1039. } st = parse_protocol;
  1040. while (p < last) {
  1041. t = *p;
  1042. switch (st) {
  1043. case parse_protocol:
  1044. if (t == ':') {
  1045. st = parse_semicolon;
  1046. SET_U (u, UF_SCHEMA);
  1047. }
  1048. else if (!g_ascii_isalnum (t) && t != '+' && t != '-') {
  1049. if (!strict && p > c) {
  1050. /* We might have some domain, but no protocol */
  1051. st = parse_domain;
  1052. p = c;
  1053. slash = c;
  1054. break;
  1055. }
  1056. else {
  1057. goto out;
  1058. }
  1059. }
  1060. p ++;
  1061. break;
  1062. case parse_semicolon:
  1063. if (t == '/') {
  1064. st = parse_slash;
  1065. p ++;
  1066. }
  1067. else {
  1068. st = parse_slash_slash;
  1069. }
  1070. break;
  1071. case parse_slash:
  1072. if (t == '/') {
  1073. st = parse_slash_slash;
  1074. }
  1075. else {
  1076. goto out;
  1077. }
  1078. p ++;
  1079. break;
  1080. case parse_slash_slash:
  1081. c = p;
  1082. st = parse_domain;
  1083. slash = p;
  1084. break;
  1085. case parse_user:
  1086. if (t == ':') {
  1087. if (p - c == 0) {
  1088. goto out;
  1089. }
  1090. SET_U (u, UF_USERINFO);
  1091. st = parse_password_start;
  1092. }
  1093. else if (t == '@') {
  1094. /* No password */
  1095. if (p - c == 0) {
  1096. goto out;
  1097. }
  1098. SET_U (u, UF_USERINFO);
  1099. st = parse_at;
  1100. }
  1101. else if (!g_ascii_isgraph (t)) {
  1102. goto out;
  1103. }
  1104. p ++;
  1105. break;
  1106. case parse_password_start:
  1107. if (t == '@') {
  1108. /* Empty password */
  1109. st = parse_at;
  1110. }
  1111. else {
  1112. c = p;
  1113. st = parse_password;
  1114. }
  1115. p ++;
  1116. break;
  1117. case parse_password:
  1118. if (t == '@') {
  1119. /* XXX: password is not stored */
  1120. st = parse_at;
  1121. }
  1122. else if (!g_ascii_isgraph (t)) {
  1123. goto out;
  1124. }
  1125. p ++;
  1126. break;
  1127. case parse_at:
  1128. c = p;
  1129. st = parse_domain;
  1130. break;
  1131. case parse_domain:
  1132. if (t == '/' || t == ':') {
  1133. if (p - c == 0) {
  1134. goto out;
  1135. }
  1136. if (t == '/') {
  1137. SET_U (u, UF_HOST);
  1138. st = parse_suffix_slash;
  1139. }
  1140. else if (!user_seen) {
  1141. /*
  1142. * Here we can have both port and password, hence we need
  1143. * to apply some heuristic here
  1144. */
  1145. st = parse_port_password;
  1146. }
  1147. else {
  1148. /*
  1149. * We can go only for parsing port here
  1150. */
  1151. SET_U (u, UF_HOST);
  1152. st = parse_port;
  1153. c = p + 1;
  1154. }
  1155. p ++;
  1156. }
  1157. else {
  1158. if (*p != '.' && *p != '-' && *p != '_') {
  1159. uc = g_utf8_get_char_validated (p, last - p);
  1160. if (uc == (gunichar)-1) {
  1161. /* Bad utf8 */
  1162. goto out;
  1163. }
  1164. if (!g_unichar_isalnum (uc)) {
  1165. /* Bad symbol */
  1166. if (strict) {
  1167. goto out;
  1168. }
  1169. else {
  1170. goto set;
  1171. }
  1172. }
  1173. p = g_utf8_next_char (p);
  1174. }
  1175. else {
  1176. p ++;
  1177. }
  1178. }
  1179. break;
  1180. case parse_port_password:
  1181. if (g_ascii_isdigit (t)) {
  1182. /* XXX: that breaks urls with passwords starting with number */
  1183. st = parse_port;
  1184. c = slash;
  1185. p --;
  1186. SET_U (u, UF_HOST);
  1187. p ++;
  1188. c = p;
  1189. }
  1190. else {
  1191. /* Rewind back */
  1192. p = slash;
  1193. c = slash;
  1194. user_seen = TRUE;
  1195. st = parse_user;
  1196. }
  1197. break;
  1198. case parse_port:
  1199. if (t == '/') {
  1200. pt = strtoul (c, NULL, 10);
  1201. if (pt == 0 || pt > 65535) {
  1202. goto out;
  1203. }
  1204. if (u != NULL) {
  1205. u->port = pt;
  1206. }
  1207. st = parse_suffix_slash;
  1208. }
  1209. else if (!g_ascii_isdigit (t)) {
  1210. if (strict || !g_ascii_isspace (t)) {
  1211. goto out;
  1212. }
  1213. else {
  1214. goto set;
  1215. }
  1216. }
  1217. p ++;
  1218. break;
  1219. case parse_suffix_slash:
  1220. if (t != '/') {
  1221. c = p;
  1222. st = parse_path;
  1223. }
  1224. else {
  1225. /* Skip extra slashes */
  1226. p ++;
  1227. }
  1228. break;
  1229. case parse_path:
  1230. if (t == '?') {
  1231. if (p - c != 0) {
  1232. SET_U (u, UF_PATH);
  1233. }
  1234. c = p + 1;
  1235. st = parse_query;
  1236. }
  1237. else if (!is_urlsafe (t)) {
  1238. if (strict) {
  1239. if (g_ascii_isspace (t)) {
  1240. goto set;
  1241. }
  1242. goto out;
  1243. }
  1244. else {
  1245. goto set;
  1246. }
  1247. }
  1248. p ++;
  1249. break;
  1250. case parse_query:
  1251. if (t == '#') {
  1252. if (p - c != 0) {
  1253. SET_U (u, UF_QUERY);
  1254. }
  1255. c = p + 1;
  1256. st = parse_part;
  1257. }
  1258. else if (!is_urlsafe (t)) {
  1259. if (strict) {
  1260. if (g_ascii_isspace (t)) {
  1261. goto set;
  1262. }
  1263. goto out;
  1264. }
  1265. else {
  1266. goto set;
  1267. }
  1268. }
  1269. p ++;
  1270. break;
  1271. case parse_part:
  1272. if (!is_urlsafe (t)) {
  1273. if (strict) {
  1274. if (g_ascii_isspace (t)) {
  1275. goto set;
  1276. }
  1277. goto out;
  1278. }
  1279. else {
  1280. goto set;
  1281. }
  1282. }
  1283. p ++;
  1284. break;
  1285. }
  1286. }
  1287. set:
  1288. /* Parse remaining */
  1289. switch (st) {
  1290. case parse_domain:
  1291. if (p - c == 0) {
  1292. goto out;
  1293. }
  1294. SET_U (u, UF_HOST);
  1295. ret = 0;
  1296. break;
  1297. case parse_port:
  1298. pt = strtoul (c, NULL, 10);
  1299. if (pt == 0 || pt > 65535) {
  1300. goto out;
  1301. }
  1302. if (u != NULL) {
  1303. u->port = pt;
  1304. }
  1305. ret = 0;
  1306. break;
  1307. case parse_suffix_slash:
  1308. /* Url ends with '/' */
  1309. ret = 0;
  1310. break;
  1311. case parse_path:
  1312. if (p - c > 0) {
  1313. SET_U (u, UF_PATH);
  1314. }
  1315. ret = 0;
  1316. break;
  1317. case parse_query:
  1318. if (p - c > 0) {
  1319. SET_U (u, UF_QUERY);
  1320. }
  1321. ret = 0;
  1322. break;
  1323. case parse_part:
  1324. if (p - c > 0) {
  1325. SET_U (u, UF_FRAGMENT);
  1326. }
  1327. ret = 0;
  1328. break;
  1329. default:
  1330. /* Error state */
  1331. ret = 1;
  1332. break;
  1333. }
  1334. out:
  1335. if (end != NULL) {
  1336. *end = p;
  1337. }
  1338. return ret;
  1339. }
  1340. #undef SET_U
  1341. enum uri_errno
  1342. rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
  1343. rspamd_mempool_t *pool)
  1344. {
  1345. struct http_parser_url u;
  1346. gchar *p, *comp;
  1347. const gchar *end;
  1348. guint i, complen, ret;
  1349. const struct {
  1350. enum rspamd_url_protocol proto;
  1351. const gchar *name;
  1352. gsize len;
  1353. } protocols[] = {
  1354. {
  1355. .proto = PROTOCOL_FILE,
  1356. .name = "file",
  1357. .len = 4
  1358. },
  1359. {
  1360. .proto = PROTOCOL_FTP,
  1361. .name = "ftp",
  1362. .len = 3
  1363. },
  1364. {
  1365. .proto = PROTOCOL_HTTP,
  1366. .name = "http",
  1367. .len = 4
  1368. },
  1369. {
  1370. .proto = PROTOCOL_HTTPS,
  1371. .name = "https",
  1372. .len = 5
  1373. },
  1374. {
  1375. .proto = PROTOCOL_MAILTO,
  1376. .name = "mailto",
  1377. .len = 6
  1378. },
  1379. {
  1380. .proto = PROTOCOL_UNKNOWN,
  1381. .name = NULL,
  1382. .len = 0
  1383. }
  1384. };
  1385. memset (uri, 0, sizeof (*uri));
  1386. memset (&u, 0, sizeof (u));
  1387. if (*uristring == '\0') {
  1388. return URI_ERRNO_EMPTY;
  1389. }
  1390. p = uristring;
  1391. if (len > sizeof ("mailto:") - 1) {
  1392. /* For mailto: urls we also need to add slashes to make it a valid URL */
  1393. if (g_ascii_strncasecmp (p, "mailto:", sizeof ("mailto:") - 1) == 0) {
  1394. ret = rspamd_mailto_parse (&u, uristring, len, &end);
  1395. }
  1396. else {
  1397. ret = rspamd_web_parse (&u, uristring, len, &end, TRUE);
  1398. }
  1399. }
  1400. else {
  1401. ret = rspamd_web_parse (&u, uristring, len, &end, TRUE);
  1402. }
  1403. if (ret != 0) {
  1404. return URI_ERRNO_BAD_FORMAT;
  1405. }
  1406. if (end > uristring && (guint)(end - uristring) != len) {
  1407. /* We have extra data at the end of uri, so we are ignoring it for now */
  1408. p = rspamd_mempool_alloc (pool, end - uristring + 1);
  1409. rspamd_strlcpy (p, uristring, end - uristring + 1);
  1410. len = end - uristring ;
  1411. }
  1412. for (i = 0; i < UF_MAX; i ++) {
  1413. if (u.field_set & (1 << i)) {
  1414. comp = p + u.field_data[i].off;
  1415. complen = u.field_data[i].len;
  1416. switch (i) {
  1417. case UF_SCHEMA:
  1418. uri->protocollen = u.field_data[i].len;
  1419. break;
  1420. case UF_HOST:
  1421. uri->host = comp;
  1422. uri->hostlen = complen;
  1423. break;
  1424. case UF_PATH:
  1425. uri->data = comp;
  1426. uri->datalen = complen;
  1427. break;
  1428. case UF_QUERY:
  1429. uri->query = comp;
  1430. uri->querylen = complen;
  1431. break;
  1432. case UF_FRAGMENT:
  1433. uri->fragment = comp;
  1434. uri->fragmentlen = complen;
  1435. break;
  1436. case UF_USERINFO:
  1437. uri->user = comp;
  1438. uri->userlen = complen;
  1439. break;
  1440. default:
  1441. break;
  1442. }
  1443. }
  1444. }
  1445. if (!uri->hostlen) {
  1446. return URI_ERRNO_BAD_FORMAT;
  1447. }
  1448. /* Now decode url symbols */
  1449. uri->string = p;
  1450. rspamd_unescape_uri (uri->string, uri->string, len);
  1451. rspamd_str_lc (uri->string, uri->protocollen);
  1452. rspamd_str_lc_utf8 (uri->host, uri->hostlen);
  1453. uri->protocol = PROTOCOL_UNKNOWN;
  1454. for (i = 0; i < G_N_ELEMENTS (protocols); i ++) {
  1455. if (uri->protocollen == protocols[i].len) {
  1456. if (memcmp (uri->string, protocols[i].name, uri->protocollen) == 0) {
  1457. uri->protocol = i;
  1458. break;
  1459. }
  1460. }
  1461. }
  1462. if (uri->protocol == PROTOCOL_UNKNOWN) {
  1463. return URI_ERRNO_INVALID_PROTOCOL;
  1464. }
  1465. return URI_ERRNO_OK;
  1466. }
  1467. static const gchar url_braces[] = {
  1468. '(', ')',
  1469. '{', '}',
  1470. '[', ']',
  1471. '<', '>',
  1472. '|', '|',
  1473. '\'', '\''
  1474. };
  1475. static gboolean
  1476. is_open_brace (gchar c)
  1477. {
  1478. if (c == '(' ||
  1479. c == '{' ||
  1480. c == '[' ||
  1481. c == '<' ||
  1482. c == '|' ||
  1483. c == '\'') {
  1484. return TRUE;
  1485. }
  1486. return FALSE;
  1487. }
  1488. static gboolean
  1489. url_file_start (const gchar *begin,
  1490. const gchar *end,
  1491. const gchar *pos,
  1492. url_match_t *match)
  1493. {
  1494. match->m_begin = pos;
  1495. return TRUE;
  1496. }
  1497. static gboolean
  1498. url_file_end (const gchar *begin,
  1499. const gchar *end,
  1500. const gchar *pos,
  1501. url_match_t *match)
  1502. {
  1503. const gchar *p;
  1504. gchar stop;
  1505. guint i;
  1506. p = pos + strlen (match->pattern);
  1507. stop = *p;
  1508. if (*p == '/') {
  1509. p++;
  1510. }
  1511. for (i = 0; i < G_N_ELEMENTS (url_braces) / 2; i += 2) {
  1512. if (*p == url_braces[i]) {
  1513. stop = url_braces[i + 1];
  1514. break;
  1515. }
  1516. }
  1517. while (p < end && *p != stop && is_urlsafe (*p)) {
  1518. p++;
  1519. }
  1520. if (p == begin) {
  1521. return FALSE;
  1522. }
  1523. match->m_len = p - match->m_begin;
  1524. return TRUE;
  1525. }
  1526. static gboolean
  1527. url_tld_start (const gchar *begin,
  1528. const gchar *end,
  1529. const gchar *pos,
  1530. url_match_t *match)
  1531. {
  1532. const gchar *p = pos;
  1533. /* Try to find the start of the url by finding any non-urlsafe character or whitespace/punctuation */
  1534. while (p >= begin) {
  1535. if ((!is_domain (*p) && *p != '.' &&
  1536. *p != '/') || g_ascii_isspace (*p)) {
  1537. p++;
  1538. if (!g_ascii_isalnum (*p)) {
  1539. /* Urls cannot start with strange symbols */
  1540. return FALSE;
  1541. }
  1542. match->m_begin = p;
  1543. return TRUE;
  1544. }
  1545. else if (p == begin && p != pos) {
  1546. match->m_begin = p;
  1547. return TRUE;
  1548. }
  1549. else if (*p == '.') {
  1550. if (p == begin) {
  1551. /* Urls cannot start with a dot */
  1552. return FALSE;
  1553. }
  1554. if (!g_ascii_isalnum (p[1])) {
  1555. /* Wrong we have an invalid character after dot */
  1556. return FALSE;
  1557. }
  1558. }
  1559. else if (*p == '/') {
  1560. /* Urls cannot contain '/' in their body */
  1561. return FALSE;
  1562. }
  1563. p--;
  1564. }
  1565. return FALSE;
  1566. }
  1567. static gboolean
  1568. url_tld_end (const gchar *begin,
  1569. const gchar *end,
  1570. const gchar *pos,
  1571. url_match_t *match)
  1572. {
  1573. const gchar *p;
  1574. /* A url must be finished by tld, so it must be followed by space character */
  1575. p = pos + strlen (match->pattern);
  1576. if (p == end || g_ascii_isspace (*p) || *p == ',') {
  1577. match->m_len = p - match->m_begin;
  1578. return TRUE;
  1579. }
  1580. else if (*p == '/' || *p == ':') {
  1581. /* Parse arguments, ports by normal way by url default function */
  1582. p = match->m_begin;
  1583. /* Check common prefix */
  1584. if (g_ascii_strncasecmp (p, "http://", sizeof ("http://") - 1) == 0) {
  1585. return url_web_end (begin,
  1586. end,
  1587. match->m_begin + sizeof ("http://") - 1,
  1588. match);
  1589. }
  1590. else {
  1591. return url_web_end (begin, end, match->m_begin, match);
  1592. }
  1593. }
  1594. return FALSE;
  1595. }
  1596. static gboolean
  1597. url_web_start (const gchar *begin,
  1598. const gchar *end,
  1599. const gchar *pos,
  1600. url_match_t *match)
  1601. {
  1602. /* Check what we have found */
  1603. if (pos > begin &&
  1604. (g_ascii_strncasecmp (pos, "www",
  1605. 3) == 0 || g_ascii_strncasecmp (pos, "ftp", 3) == 0)) {
  1606. if (!is_open_brace (*(pos - 1)) && !g_ascii_isspace (*(pos - 1))) {
  1607. return FALSE;
  1608. }
  1609. }
  1610. if (*pos == '.') {
  1611. /* Urls cannot start with . */
  1612. return FALSE;
  1613. }
  1614. match->m_begin = pos;
  1615. return TRUE;
  1616. }
  1617. static gboolean
  1618. url_web_end (const gchar *begin,
  1619. const gchar *end,
  1620. const gchar *pos,
  1621. url_match_t *match)
  1622. {
  1623. const gchar *last = NULL;
  1624. if (rspamd_web_parse (NULL, pos, end - pos, &last, FALSE) != 0) {
  1625. return FALSE;
  1626. }
  1627. match->m_len = (last - pos);
  1628. return TRUE;
  1629. }
  1630. static gboolean
  1631. url_email_start (const gchar *begin,
  1632. const gchar *end,
  1633. const gchar *pos,
  1634. url_match_t *match)
  1635. {
  1636. const gchar *p;
  1637. /* Check what we have found */
  1638. if (pos > begin && *pos == '@') {
  1639. /* Try to extract it with username */
  1640. p = pos - 1;
  1641. while (p > begin && is_atom (*p)) {
  1642. p--;
  1643. }
  1644. if (!is_atom (*p) && p != pos - 1) {
  1645. match->m_begin = p + 1;
  1646. return TRUE;
  1647. }
  1648. else if (p == begin) {
  1649. match->m_begin = p;
  1650. return TRUE;
  1651. }
  1652. }
  1653. else {
  1654. p = pos + strlen (match->pattern);
  1655. if (is_atom (*p)) {
  1656. match->m_begin = pos;
  1657. return TRUE;
  1658. }
  1659. }
  1660. return FALSE;
  1661. }
  1662. static gboolean
  1663. url_email_end (const gchar *begin,
  1664. const gchar *end,
  1665. const gchar *pos,
  1666. url_match_t *match)
  1667. {
  1668. const gchar *p;
  1669. gboolean got_at = FALSE;
  1670. p = pos + strlen (match->pattern);
  1671. if (*pos == '@') {
  1672. got_at = TRUE;
  1673. }
  1674. while (p < end && (is_domain (*p) || *p == '_'
  1675. || (*p == '@' && !got_at) ||
  1676. *p == '.')) {
  1677. if (*p == '@') {
  1678. got_at = TRUE;
  1679. }
  1680. p++;
  1681. }
  1682. /* Strip strange symbols at the end */
  1683. if (got_at) {
  1684. while (p >= match->m_begin &&
  1685. (!is_domain (*p) || *p == '.' || *p == '_')) {
  1686. p --;
  1687. }
  1688. p ++;
  1689. }
  1690. match->m_len = p - match->m_begin;
  1691. match->add_prefix = TRUE;
  1692. return got_at;
  1693. }
  1694. void
  1695. rspamd_url_text_extract (rspamd_mempool_t * pool,
  1696. struct rspamd_task *task,
  1697. struct mime_text_part *part,
  1698. gboolean is_html)
  1699. {
  1700. gint rc;
  1701. gchar *url_str = NULL, *url_start, *url_end;
  1702. struct rspamd_url *new;
  1703. struct process_exception *ex;
  1704. gchar *p, *end, *begin;
  1705. if (part->content == NULL || part->content->len == 0) {
  1706. msg_warn ("got empty text part");
  1707. return;
  1708. }
  1709. begin = part->content->data;
  1710. end = begin + part->content->len;
  1711. p = begin;
  1712. while (p < end) {
  1713. if (rspamd_url_find (pool, p, end - p, &url_start, &url_end, &url_str,
  1714. is_html)) {
  1715. if (url_str != NULL) {
  1716. new = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
  1717. ex =
  1718. rspamd_mempool_alloc0 (pool,
  1719. sizeof (struct process_exception));
  1720. if (new != NULL) {
  1721. g_strstrip (url_str);
  1722. rc = rspamd_url_parse (new, url_str, strlen (url_str), pool);
  1723. if (rc == URI_ERRNO_OK &&
  1724. new->hostlen > 0) {
  1725. ex->pos = url_start - begin;
  1726. ex->len = url_end - url_start;
  1727. if (new->protocol == PROTOCOL_MAILTO) {
  1728. if (new->userlen > 0) {
  1729. if (!g_tree_lookup (task->emails, new)) {
  1730. g_tree_insert (task->emails, new, new);
  1731. }
  1732. }
  1733. }
  1734. else {
  1735. if (!g_tree_lookup (task->urls, new)) {
  1736. g_tree_insert (task->urls, new, new);
  1737. }
  1738. }
  1739. part->urls_offset = g_list_prepend (
  1740. part->urls_offset,
  1741. ex);
  1742. }
  1743. else if (rc != URI_ERRNO_OK) {
  1744. msg_info ("extract of url '%s' failed: %s",
  1745. url_str,
  1746. rspamd_url_strerror (rc));
  1747. }
  1748. }
  1749. }
  1750. }
  1751. else {
  1752. break;
  1753. }
  1754. p = url_end + 1;
  1755. }
  1756. /* Handle offsets of this part */
  1757. if (part->urls_offset != NULL) {
  1758. part->urls_offset = g_list_reverse (part->urls_offset);
  1759. rspamd_mempool_add_destructor (task->task_pool,
  1760. (rspamd_mempool_destruct_t)g_list_free, part->urls_offset);
  1761. }
  1762. }
  1763. gboolean
  1764. rspamd_url_find (rspamd_mempool_t *pool,
  1765. const gchar *begin,
  1766. gsize len,
  1767. gchar **start,
  1768. gchar **fin,
  1769. gchar **url_str,
  1770. gboolean is_html)
  1771. {
  1772. const gchar *end, *pos;
  1773. gint idx, l;
  1774. struct url_matcher *matcher;
  1775. url_match_t m;
  1776. end = begin + len;
  1777. if ((pos =
  1778. rspamd_trie_lookup (url_scanner->search_trie, begin, len,
  1779. &idx)) == NULL) {
  1780. return FALSE;
  1781. }
  1782. else {
  1783. matcher = &g_array_index (url_scanner->matchers, struct url_matcher, idx);
  1784. if ((matcher->flags & URL_FLAG_NOHTML) && is_html) {
  1785. /* Do not try to match non-html like urls in html texts */
  1786. return FALSE;
  1787. }
  1788. m.pattern = matcher->pattern;
  1789. m.prefix = matcher->prefix;
  1790. m.add_prefix = FALSE;
  1791. if (matcher->start (begin, end, pos,
  1792. &m) && matcher->end (begin, end, pos, &m)) {
  1793. if (m.add_prefix || matcher->prefix[0] != '\0') {
  1794. l = m.m_len + 1 + strlen (m.prefix);
  1795. *url_str = rspamd_mempool_alloc (pool, l);
  1796. rspamd_snprintf (*url_str,
  1797. l,
  1798. "%s%*s",
  1799. m.prefix,
  1800. m.m_len,
  1801. m.m_begin);
  1802. }
  1803. else {
  1804. *url_str = rspamd_mempool_alloc (pool, m.m_len + 1);
  1805. memcpy (*url_str, m.m_begin, m.m_len);
  1806. (*url_str)[m.m_len] = '\0';
  1807. }
  1808. if (start != NULL) {
  1809. *start = (gchar *)m.m_begin;
  1810. }
  1811. if (fin != NULL) {
  1812. *fin = (gchar *)m.m_begin + m.m_len;
  1813. }
  1814. }
  1815. else {
  1816. *url_str = NULL;
  1817. if (start != NULL) {
  1818. *start = (gchar *)pos;
  1819. }
  1820. if (fin != NULL) {
  1821. *fin = (gchar *)pos + strlen (m.prefix);
  1822. }
  1823. }
  1824. return TRUE;
  1825. }
  1826. return FALSE;
  1827. }
  1828. struct rspamd_url *
  1829. rspamd_url_get_next (rspamd_mempool_t *pool,
  1830. const gchar *start, gchar const **pos)
  1831. {
  1832. const gchar *p, *end;
  1833. gchar *url_str = NULL, *url_start, *url_end;
  1834. struct rspamd_url *new;
  1835. gint rc;
  1836. end = start + strlen (start);
  1837. if (pos == NULL || *pos == NULL) {
  1838. p = start;
  1839. }
  1840. else {
  1841. p = *pos;
  1842. }
  1843. if (p < end) {
  1844. if (rspamd_url_find (pool, p, end - p, &url_start, &url_end, &url_str,
  1845. FALSE)) {
  1846. if (url_str != NULL) {
  1847. new = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
  1848. if (new != NULL) {
  1849. g_strstrip (url_str);
  1850. rc = rspamd_url_parse (new, url_str, strlen (url_str), pool);
  1851. if (rc == URI_ERRNO_OK &&
  1852. new->hostlen > 0) {
  1853. if (new->protocol == PROTOCOL_MAILTO) {
  1854. if (new->userlen > 0) {
  1855. return new;
  1856. }
  1857. }
  1858. else {
  1859. return new;
  1860. }
  1861. }
  1862. else if (rc != URI_ERRNO_OK) {
  1863. msg_info ("extract of url '%s' failed: %s",
  1864. url_str,
  1865. rspamd_url_strerror (rc));
  1866. }
  1867. }
  1868. }
  1869. }
  1870. p = url_end + 1;
  1871. if (pos != NULL) {
  1872. *pos = p;
  1873. }
  1874. }
  1875. return NULL;
  1876. }
  1877. /*
  1878. * vi: ts=4
  1879. */