You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

url.h 11KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446
  1. /*
  2. * Copyright 2024 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. /* URL check functions */
  17. #ifndef URL_H
  18. #define URL_H
  19. #include "config.h"
  20. #include "mem_pool.h"
  21. #include "khash.h"
  22. #include "fstring.h"
  23. #include "libutil/cxx/utf8_util.h"
  24. #ifdef __cplusplus
  25. extern "C" {
  26. #endif
  27. struct rspamd_task;
  28. struct rspamd_mime_text_part;
  /**
   * Bit flags describing a url. They are OR-ed together in the `flags`
   * field of `struct rspamd_url`. Every constant owns one distinct bit,
   * from bit 0 up to bit 26 (see RSPAMD_URL_MAX_FLAG_SHIFT).
   */
  enum rspamd_url_flags {
      RSPAMD_URL_FLAG_PHISHED = 1u << 0u,
      RSPAMD_URL_FLAG_NUMERIC = 1u << 1u,
      /* Set by rspamd_url_normalise_propagate_flags on a normalisation error */
      RSPAMD_URL_FLAG_OBSCURED = 1u << 2u,
      RSPAMD_URL_FLAG_REDIRECTED = 1u << 3u,
      RSPAMD_URL_FLAG_HTML_DISPLAYED = 1u << 4u,
      RSPAMD_URL_FLAG_FROM_TEXT = 1u << 5u,
      RSPAMD_URL_FLAG_SUBJECT = 1u << 6u,
      RSPAMD_URL_FLAG_HOSTENCODED = 1u << 7u,
      RSPAMD_URL_FLAG_SCHEMAENCODED = 1u << 8u,
      RSPAMD_URL_FLAG_PATHENCODED = 1u << 9u,
      RSPAMD_URL_FLAG_QUERYENCODED = 1u << 10u,
      RSPAMD_URL_FLAG_MISSINGSLASHES = 1u << 11u,
      RSPAMD_URL_FLAG_IDN = 1u << 12u,
      /* When set (and `ext` is non-NULL) rspamd_url_get_port reads `ext->port` */
      RSPAMD_URL_FLAG_HAS_PORT = 1u << 13u,
      RSPAMD_URL_FLAG_HAS_USER = 1u << 14u,
      RSPAMD_URL_FLAG_SCHEMALESS = 1u << 15u,
      /* Set by rspamd_url_normalise_propagate_flags for unnormalised input */
      RSPAMD_URL_FLAG_UNNORMALISED = 1u << 16u,
      /* Set by rspamd_url_normalise_propagate_flags when zero-width spaces are reported */
      RSPAMD_URL_FLAG_ZW_SPACES = 1u << 17u,
      RSPAMD_URL_FLAG_DISPLAY_URL = 1u << 18u,
      RSPAMD_URL_FLAG_IMAGE = 1u << 19u,
      RSPAMD_URL_FLAG_QUERY = 1u << 20u,
      RSPAMD_URL_FLAG_CONTENT = 1u << 21u,
      RSPAMD_URL_FLAG_NO_TLD = 1u << 22u,
      RSPAMD_URL_FLAG_TRUNCATED = 1u << 23u,
      RSPAMD_URL_FLAG_REDIRECT_TARGET = 1u << 24u,
      RSPAMD_URL_FLAG_INVISIBLE = 1u << 25u,
      RSPAMD_URL_FLAG_SPECIAL = 1u << 26u,
  };
  /* Shift of the highest bit used in enum rspamd_url_flags (RSPAMD_URL_FLAG_SPECIAL is 1u << 26u) */
  #define RSPAMD_URL_MAX_FLAG_SHIFT (26u)
  /**
   * A tag attached to a url: one string pointer plus `prev`/`next` links.
   * NOTE(review): the links look like a doubly linked list of tags; confirm
   * against the code that builds it.
   */
  struct rspamd_url_tag {
      const char *data;
      struct rspamd_url_tag *prev, *next;
  };
  struct rspamd_url_ext;

  /**
   * URL structure
   *
   * Components are not stored as separate strings: each one is described by
   * an offset (`*shift`) into `string` and a length (`*len`), which is what
   * the rspamd_url_*() accessor macros rely on.
   */
  struct rspamd_url {
      /* Base buffer that every `*shift` offset is relative to */
      char *string;
      /* NOTE(review): presumably the url text before processing; confirm */
      char *raw;
      /* Rarely used fields; may be NULL (rspamd_url_get_port checks it) */
      struct rspamd_url_ext *ext;

      /* Combination of enum rspamd_url_flags bits */
      uint32_t flags;

      /* Compared against enum rspamd_url_protocol values */
      uint8_t protocol;
      uint8_t protocollen;

      /* Offsets of the components inside `string` */
      uint16_t hostshift;
      uint16_t datashift;
      uint16_t queryshift;
      uint16_t fragmentshift;
      uint16_t tldshift;
      uint16_t usershift;

      /* Lengths of the components; zero makes rspamd_url_user/host yield NULL */
      uint16_t userlen;
      uint16_t hostlen;
      uint16_t datalen;
      uint16_t querylen;
      uint16_t fragmentlen;
      uint16_t tldlen;

      /* NOTE(review): presumably the occurrence counter bumped by rspamd_url_set_add_or_increase; confirm */
      uint16_t count;
      /* NOTE(review): presumably the lengths of `string` and `raw`; confirm */
      uint16_t urllen;
      uint16_t rawlen;

      /* Absolute order of the URL in a message */
      uint16_t order;
      /* Order of the URL in a specific part of message */
      uint16_t part_order;
  };
  /**
   * Rarely used url fields, reached through the `ext` pointer of
   * `struct rspamd_url`.
   */
  struct rspamd_url_ext {
      char *visible_part;
      struct rspamd_url *linked_url;
      /* Returned by rspamd_url_get_port when RSPAMD_URL_FLAG_HAS_PORT is set */
      uint16_t port;
  };
  /*
   * Accessors for the components of a `struct rspamd_url *`. Every component
   * is addressed as `u->string + u-><component>shift`.
   *
   * rspamd_url_user() and rspamd_url_host() yield NULL when the matching
   * length field is zero; the `_unsafe` variants skip that check and always
   * return `string + shift`. The checked forms expand `u` more than once,
   * so the argument must be free of side effects.
   */
  #define rspamd_url_user(u) ((u)->userlen > 0 ? (u)->string + (u)->usershift : NULL)
  #define rspamd_url_user_unsafe(u) ((u)->string + (u)->usershift)
  #define rspamd_url_host(u) ((u)->hostlen > 0 ? (u)->string + (u)->hostshift : NULL)
  #define rspamd_url_host_unsafe(u) ((u)->string + (u)->hostshift)
  #define rspamd_url_tld_unsafe(u) ((u)->string + (u)->tldshift)
  #define rspamd_url_data_unsafe(u) ((u)->string + (u)->datashift)
  #define rspamd_url_query_unsafe(u) ((u)->string + (u)->queryshift)
  #define rspamd_url_fragment_unsafe(u) ((u)->string + (u)->fragmentshift)
  /**
   * Result codes of url parsing (returned by rspamd_url_parse).
   * Values are consecutive, starting at 0 for success.
   */
  enum uri_errno {
      URI_ERRNO_OK = 0,           /* Parsing went well */
      URI_ERRNO_EMPTY,            /* The URI string was empty */
      URI_ERRNO_INVALID_PROTOCOL, /* No protocol was found */
      URI_ERRNO_INVALID_PORT,     /* Port number is bad */
      URI_ERRNO_BAD_ENCODING,     /* Bad characters encoding */
      URI_ERRNO_BAD_FORMAT,
      URI_ERRNO_TLD_MISSING,
      URI_ERRNO_HOST_MISSING,
      URI_ERRNO_TOO_LONG,
  };
  /**
   * Url protocols, one bit each. All values fit the `uint8_t protocol`
   * field of `struct rspamd_url`. Bit 6 is not assigned; PROTOCOL_UNKNOWN
   * uses bit 7.
   */
  enum rspamd_url_protocol {
      PROTOCOL_FILE = 1u << 0u,
      PROTOCOL_FTP = 1u << 1u,
      PROTOCOL_HTTP = 1u << 2u,
      PROTOCOL_HTTPS = 1u << 3u,
      PROTOCOL_MAILTO = 1u << 4u,
      PROTOCOL_TELEPHONE = 1u << 5u,
      PROTOCOL_UNKNOWN = 1u << 7u,
  };
  /**
   * Flags accepted by rspamd_url_parse. RSPAMD_URL_PARSE_TEXT is the
   * zero value (no bits set); the other two are independent bits.
   */
  enum rspamd_url_parse_flags {
      RSPAMD_URL_PARSE_TEXT = 0u,
      RSPAMD_URL_PARSE_HREF = (1u << 0u),
      RSPAMD_URL_PARSE_CHECK = (1u << 1u),
  };
  /**
   * Search mode passed as `how` to the rspamd_url_find*() functions and to
   * rspamd_url_text_extract.
   */
  enum rspamd_url_find_type {
      RSPAMD_URL_FIND_ALL = 0,
      RSPAMD_URL_FIND_STRICT,
  };
  /**
   * Initialize url library
   * @param tld_file NOTE(review): presumably the path of the file listing
   *                 top level domains; confirm against the implementation
   */
  void rspamd_url_init(const char *tld_file);

  /**
   * Counterpart of rspamd_url_init
   */
  void rspamd_url_deinit(void);
  145. /*
  146. * Parse urls inside text
  147. * @param pool memory pool
  148. * @param task task object
  149. * @param part current text part
  150. * @param is_html turn on html heuristic
  151. */
  152. void rspamd_url_text_extract(rspamd_mempool_t *pool,
  153. struct rspamd_task *task,
  154. struct rspamd_mime_text_part *part,
  155. uint16_t *cur_order,
  156. enum rspamd_url_find_type how);
  157. /*
  158. * Parse a single url into an uri structure
  159. * @param pool memory pool
  160. * @param uristring text form of url
  161. * @param uri url object, must be pre allocated
  162. */
  163. enum uri_errno rspamd_url_parse(struct rspamd_url *uri,
  164. char *uristring,
  165. gsize len,
  166. rspamd_mempool_t *pool,
  167. enum rspamd_url_parse_flags flags);
  168. /*
  169. * Try to extract url from a text
  170. * @param pool memory pool
  171. * @param begin begin of text
  172. * @param len length of text
  173. * @param start storage for start position of url found (or NULL)
  174. * @param end storage for end position of url found (or NULL)
  175. * @param url_str storage for url string(or NULL)
  176. * @return TRUE if url is found in specified text
  177. */
  178. gboolean rspamd_url_find(rspamd_mempool_t *pool,
  179. const char *begin, gsize len,
  180. char **url_str,
  181. enum rspamd_url_find_type how,
  182. goffset *url_pos,
  183. gboolean *prefix_added);
  /**
   * Return text representation of url parsing error
   * @param err error code (NOTE(review): presumably an enum uri_errno value
   *            as returned by rspamd_url_parse; confirm)
   * @return error description string
   */
  const char *rspamd_url_strerror(int err);
  188. /**
  189. * Find TLD for a specified host string
  190. * @param in input host
  191. * @param inlen length of input
  192. * @param out output rspamd_ftok_t with tld position
  193. * @return TRUE if tld has been found
  194. */
  195. gboolean rspamd_url_find_tld(const char *in, gsize inlen, rspamd_ftok_t *out);
  196. typedef gboolean (*url_insert_function)(struct rspamd_url *url,
  197. gsize start_offset, gsize end_offset, void *ud);
  198. /**
  199. * Search for multiple urls in text and call `func` for each url found
  200. * @param pool
  201. * @param in
  202. * @param inlen
  203. * @param is_html
  204. * @param func
  205. * @param ud
  206. */
  207. void rspamd_url_find_multiple(rspamd_mempool_t *pool,
  208. const char *in, gsize inlen,
  209. enum rspamd_url_find_type how,
  210. GPtrArray *nlines,
  211. url_insert_function func,
  212. gpointer ud);
  213. /**
  214. * Search for a single url in text and call `func` for each url found
  215. * @param pool
  216. * @param in
  217. * @param inlen
  218. * @param is_html
  219. * @param func
  220. * @param ud
  221. */
  222. void rspamd_url_find_single(rspamd_mempool_t *pool,
  223. const char *in, gsize inlen,
  224. enum rspamd_url_find_type how,
  225. url_insert_function func,
  226. gpointer ud);
  227. /**
  228. * Generic callback to insert URLs into rspamd_task
  229. * @param url
  230. * @param start_offset
  231. * @param end_offset
  232. * @param ud
  233. */
  234. gboolean rspamd_url_task_subject_callback(struct rspamd_url *url,
  235. gsize start_offset,
  236. gsize end_offset, gpointer ud);
  237. /**
  238. * Decode URL encoded string in-place and return new length of a string, src and dst are NULL terminated
  239. * @param dst
  240. * @param src
  241. * @param size
  242. * @return
  243. */
  244. gsize rspamd_url_decode(char *dst, const char *src, gsize size);
  245. /**
  246. * Encode url if needed. In this case, memory is allocated from the specific pool.
  247. * Returns pointer to begin and encoded length in `dlen`
  248. * @param url
  249. * @param pool
  250. * @return
  251. */
  252. const char *rspamd_url_encode(struct rspamd_url *url, gsize *dlen,
  253. rspamd_mempool_t *pool);
  254. /**
  255. * Returns if a character is domain character
  256. * @param c
  257. * @return
  258. */
  259. gboolean rspamd_url_is_domain(int c);
  /**
   * Returns symbolic name for protocol
   * @param proto protocol, one of enum rspamd_url_protocol
   * @return protocol name string
   */
  const char *rspamd_url_protocol_name(enum rspamd_url_protocol proto);
  /**
   * Converts string to a numeric protocol
   * @param str protocol name
   * @return matching enum rspamd_url_protocol value (NOTE(review): presumably
   *         PROTOCOL_UNKNOWN when the name is not recognised; confirm)
   */
  enum rspamd_url_protocol rspamd_url_protocol_from_string(const char *str);
  /**
   * Converts string to a url flag
   * @param str flag name
   * @param flag output: the flag value (NOTE(review): presumably one of
   *             enum rspamd_url_flags, written only on success; confirm)
   * @return bool (NOTE(review): presumably true when `str` names a known
   *         flag; confirm)
   */
  bool rspamd_url_flag_from_string(const char *str, int *flag);
  /**
   * Converts url flag to a string
   * @param flag flag value (NOTE(review): presumably a single
   *             enum rspamd_url_flags bit; confirm)
   * @return flag name string
   */
  const char *rspamd_url_flag_to_string(int flag);
  285. /* Defines sets of urls indexed by url as is */
  286. KHASH_DECLARE(rspamd_url_hash, struct rspamd_url *, char);
  287. KHASH_DECLARE(rspamd_url_host_hash, struct rspamd_url *, char);
  288. /* Convenience functions for url sets */
  289. /**
  290. * Add an url to set or increase the existing url count
  291. * @param set
  292. * @param u
  293. * @return true if a new url has been added
  294. */
  295. bool rspamd_url_set_add_or_increase(khash_t(rspamd_url_hash) * set,
  296. struct rspamd_url *u,
  297. bool enforce_replace);
  298. /**
  299. * Same as rspamd_url_set_add_or_increase but returns the existing url if found
  300. * @param set
  301. * @param u
  302. * @return
  303. */
  304. struct rspamd_url *rspamd_url_set_add_or_return(khash_t(rspamd_url_hash) * set,
  305. struct rspamd_url *u);
  306. /**
  307. * Helper for url host set
  308. * @param set
  309. * @param u
  310. * @return
  311. */
  312. bool rspamd_url_host_set_add(khash_t(rspamd_url_host_hash) * set,
  313. struct rspamd_url *u);
  314. /**
  315. * Checks if a url is in set
  316. * @param set
  317. * @param u
  318. * @return
  319. */
  320. bool rspamd_url_set_has(khash_t(rspamd_url_hash) * set, struct rspamd_url *u);
  321. bool rspamd_url_host_set_has(khash_t(rspamd_url_host_hash) * set, struct rspamd_url *u);
  /**
   * Compares two urls (similar to C comparison functions) lexicographically
   * @param u1 first url
   * @param u2 second url
   * @return comparison result in the style of C comparison functions
   *         (NOTE(review): presumably negative, zero or positive; confirm)
   */
  int rspamd_url_cmp(const struct rspamd_url *u1, const struct rspamd_url *u2);
  /**
   * Same as rspamd_url_cmp but used for qsort to sort `struct rspamd_url *[]` array
   * @param u1 pointer to the first array element (a `struct rspamd_url *`)
   * @param u2 pointer to the second array element (a `struct rspamd_url *`)
   * @return comparison result in the form qsort expects
   */
  int rspamd_url_cmp_qsort(const void *u1, const void *u2);
  336. /**
  337. * Returns a port for some url
  338. * @param u
  339. * @return
  340. */
  341. static RSPAMD_PURE_FUNCTION inline uint16_t rspamd_url_get_port(struct rspamd_url *u)
  342. {
  343. if ((u->flags & RSPAMD_URL_FLAG_HAS_PORT) && u->ext) {
  344. return u->ext->port;
  345. }
  346. else {
  347. /* Assume standard port */
  348. if (u->protocol == PROTOCOL_HTTPS) {
  349. return 443;
  350. }
  351. else {
  352. return 80;
  353. }
  354. }
  355. }
  356. /**
  357. * Returns a port for some url if it is set
  358. * @param u
  359. * @return
  360. */
  361. static RSPAMD_PURE_FUNCTION inline uint16_t rspamd_url_get_port_if_special(struct rspamd_url *u)
  362. {
  363. if ((u->flags & RSPAMD_URL_FLAG_HAS_PORT) && u->ext) {
  364. return u->ext->port;
  365. }
  366. return 0;
  367. }
  /**
   * Normalize unicode input and set out url flags as appropriate
   *
   * Calls rspamd_normalise_unicode_inplace(input, len_out) and ORs into
   * `url_flags_out`:
   *  - RSPAMD_URL_FLAG_UNNORMALISED on RSPAMD_UNICODE_NORM_UNNORMAL
   *  - RSPAMD_URL_FLAG_ZW_SPACES on RSPAMD_UNICODE_NORM_ZERO_SPACES
   *  - RSPAMD_URL_FLAG_OBSCURED on RSPAMD_UNICODE_NORM_ERROR
   *
   * @param pool not used by the expansion; kept so existing callers compile
   * @param input buffer normalised in place
   * @param len_out pointer to the length (must be &var)
   * @param url_flags_out modifiable lvalue receiving the flags. It is
   *                      parenthesised in the expansion, so any lvalue
   *                      expression works, but it may be evaluated up to
   *                      three times and must have no side effects.
   */
  #define rspamd_url_normalise_propagate_flags(pool, input, len_out, url_flags_out) \
      do {                                                                          \
          enum rspamd_utf8_normalise_result url_norm_res_;                          \
          url_norm_res_ = rspamd_normalise_unicode_inplace((input), (len_out));     \
          if (url_norm_res_ & RSPAMD_UNICODE_NORM_UNNORMAL) {                       \
              (url_flags_out) |= RSPAMD_URL_FLAG_UNNORMALISED;                      \
          }                                                                         \
          if (url_norm_res_ & RSPAMD_UNICODE_NORM_ZERO_SPACES) {                    \
              (url_flags_out) |= RSPAMD_URL_FLAG_ZW_SPACES;                         \
          }                                                                         \
          if (url_norm_res_ & (RSPAMD_UNICODE_NORM_ERROR)) {                        \
              (url_flags_out) |= RSPAMD_URL_FLAG_OBSCURED;                          \
          }                                                                         \
      } while (0)
  389. #ifdef __cplusplus
  390. }
  391. #endif
  392. #endif