You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

url.h 10KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378
  1. /* URL check functions */
  2. #ifndef URL_H
  3. #define URL_H
  4. #include "config.h"
  5. #include "mem_pool.h"
  6. #include "khash.h"
  7. #include "fstring.h"
  8. #include "libutil/cxx/utf8_util.h"
  9. #ifdef __cplusplus
  10. extern "C" {
  11. #endif
  12. struct rspamd_task;
  13. struct rspamd_mime_text_part;
  /*
   * Bit flags describing an url; each enumerator occupies one distinct bit
   * (bits 0 through 26).
   * NOTE(review): presumably stored in `rspamd_url.flags` - confirm.
   */
  enum rspamd_url_flags {
    RSPAMD_URL_FLAG_PHISHED         = (1u << 0),
    RSPAMD_URL_FLAG_NUMERIC         = (1u << 1),
    RSPAMD_URL_FLAG_OBSCURED        = (1u << 2),
    RSPAMD_URL_FLAG_REDIRECTED      = (1u << 3),
    RSPAMD_URL_FLAG_HTML_DISPLAYED  = (1u << 4),
    RSPAMD_URL_FLAG_FROM_TEXT       = (1u << 5),
    RSPAMD_URL_FLAG_SUBJECT         = (1u << 6),
    RSPAMD_URL_FLAG_HOSTENCODED     = (1u << 7),
    RSPAMD_URL_FLAG_SCHEMAENCODED   = (1u << 8),
    RSPAMD_URL_FLAG_PATHENCODED     = (1u << 9),
    RSPAMD_URL_FLAG_QUERYENCODED    = (1u << 10),
    RSPAMD_URL_FLAG_MISSINGSLASHES  = (1u << 11),
    RSPAMD_URL_FLAG_IDN             = (1u << 12),
    RSPAMD_URL_FLAG_HAS_PORT        = (1u << 13),
    RSPAMD_URL_FLAG_HAS_USER        = (1u << 14),
    RSPAMD_URL_FLAG_SCHEMALESS      = (1u << 15),
    RSPAMD_URL_FLAG_UNNORMALISED    = (1u << 16),
    RSPAMD_URL_FLAG_ZW_SPACES       = (1u << 17),
    RSPAMD_URL_FLAG_DISPLAY_URL     = (1u << 18),
    RSPAMD_URL_FLAG_IMAGE           = (1u << 19),
    RSPAMD_URL_FLAG_QUERY           = (1u << 20),
    RSPAMD_URL_FLAG_CONTENT         = (1u << 21),
    RSPAMD_URL_FLAG_NO_TLD          = (1u << 22),
    RSPAMD_URL_FLAG_TRUNCATED       = (1u << 23),
    RSPAMD_URL_FLAG_REDIRECT_TARGET = (1u << 24),
    RSPAMD_URL_FLAG_INVISIBLE       = (1u << 25),
    RSPAMD_URL_FLAG_SPECIAL         = (1u << 26),
  };
  /* Highest bit position used in enum rspamd_url_flags; update together with the enum */
  #define RSPAMD_URL_MAX_FLAG_SHIFT (26u)
  /*
   * Node of a doubly linked list of url tags.
   */
  struct rspamd_url_tag {
    /* Tag payload (NOTE(review): ownership and encoding are not visible here) */
    const gchar *data;
    /* Neighbouring nodes in the list */
    struct rspamd_url_tag *prev, *next;
  };
  /*
   * Parsed url. Components are not stored as separate strings: each one is
   * described by an offset (`*shift`) into `string` plus a length (`*len`),
   * which is how the rspamd_url_*() accessor macros read them.
   */
  struct rspamd_url {
    /* Buffer that all `*shift` offsets index into */
    gchar *string;
    /* NOTE(review): presumably the url text before normalisation/decoding - confirm */
    gchar *raw;
    /* NOTE(review): presumably the text displayed for this url (e.g. in HTML) - confirm */
    gchar *visible_part;
    /* Another url associated with this one (NOTE(review): exact relation not visible here) */
    struct rspamd_url *linked_url;
    /* Bit mask (NOTE(review): presumably of enum rspamd_url_flags values) */
    guint32 flags;
    /* NOTE(review): presumably an enum rspamd_url_protocol value; all of them fit in 8 bits */
    guint8 protocol;
    /* NOTE(review): presumably length of the scheme part - confirm */
    guint8 protocollen;
    guint16 port;
    /* Offsets of the components inside `string` */
    guint16 usershift;
    guint16 hostshift;
    guint16 datashift;
    guint16 queryshift;
    guint16 fragmentshift;
    guint16 tldshift;
    /* Lengths of the components; userlen/hostlen == 0 makes the checked accessors yield NULL */
    guint16 userlen;
    guint16 hostlen;
    guint16 datalen;
    guint16 querylen;
    guint16 fragmentlen;
    guint16 tldlen;
    /* Occurrence counter (NOTE(review): presumably bumped by rspamd_url_set_add_or_increase) */
    guint16 count;
    /* NOTE(review): presumably length of `string` */
    guint16 urllen;
    /* NOTE(review): presumably length of `raw` */
    guint16 rawlen;
  };
  /*
   * Accessors for url components, all pointing inside `(u)->string`.
   * The checked forms (user, host) yield NULL when the component length is
   * zero; the `_unsafe` forms return the address at the component offset
   * without looking at the length. Every macro evaluates `u` more than once,
   * so do not pass an expression with side effects.
   */
  #define rspamd_url_user(u) ((u)->userlen == 0 ? NULL : &(u)->string[(u)->usershift])
  #define rspamd_url_user_unsafe(u) (&(u)->string[(u)->usershift])
  #define rspamd_url_host(u) ((u)->hostlen == 0 ? NULL : &(u)->string[(u)->hostshift])
  #define rspamd_url_host_unsafe(u) (&(u)->string[(u)->hostshift])
  #define rspamd_url_tld_unsafe(u) (&(u)->string[(u)->tldshift])
  #define rspamd_url_data_unsafe(u) (&(u)->string[(u)->datashift])
  #define rspamd_url_query_unsafe(u) (&(u)->string[(u)->queryshift])
  #define rspamd_url_fragment_unsafe(u) (&(u)->string[(u)->fragmentshift])
  /*
   * Result codes of url parsing; zero means success, every other value
   * identifies a specific failure.
   */
  enum uri_errno {
    URI_ERRNO_OK               = 0, /* Parsing went well */
    URI_ERRNO_EMPTY            = 1, /* The URI string was empty */
    URI_ERRNO_INVALID_PROTOCOL = 2, /* No protocol was found */
    URI_ERRNO_INVALID_PORT     = 3, /* Port number is bad */
    URI_ERRNO_BAD_ENCODING     = 4, /* Bad characters encoding */
    URI_ERRNO_BAD_FORMAT       = 5,
    URI_ERRNO_TLD_MISSING      = 6,
    URI_ERRNO_HOST_MISSING     = 7,
    URI_ERRNO_TOO_LONG         = 8,
  };
  /*
   * Url schemes, one bit each so that several can be combined in a mask.
   * Bit 6 is not assigned; PROTOCOL_UNKNOWN takes bit 7, the top bit of an
   * 8-bit value.
   */
  enum rspamd_url_protocol {
    PROTOCOL_FILE      = (1u << 0),
    PROTOCOL_FTP       = (1u << 1),
    PROTOCOL_HTTP      = (1u << 2),
    PROTOCOL_HTTPS     = (1u << 3),
    PROTOCOL_MAILTO    = (1u << 4),
    PROTOCOL_TELEPHONE = (1u << 5),
    PROTOCOL_UNKNOWN   = (1u << 7),
  };
  /*
   * Options accepted by rspamd_url_parse. RSPAMD_URL_PARSE_TEXT is the
   * zero value (no bits set); the others are independent bits.
   */
  enum rspamd_url_parse_flags {
    RSPAMD_URL_PARSE_TEXT  = 0x0u,
    RSPAMD_URL_PARSE_HREF  = 0x1u,
    RSPAMD_URL_PARSE_CHECK = 0x2u,
  };
  /*
   * Search mode passed as `how` to the url finding functions.
   */
  enum rspamd_url_find_type {
    RSPAMD_URL_FIND_ALL    = 0,
    RSPAMD_URL_FIND_STRICT = 1,
  };
  /**
   * Initialize url library
   * @param tld_file NOTE(review): presumably the path of the TLD list file,
   *        judging by the parameter name - confirm against the implementation
   */
  void rspamd_url_init(const gchar *tld_file);
  /**
   * Counterpart of rspamd_url_init
   * (NOTE(review): presumably releases what init has set up - confirm)
   */
  void rspamd_url_deinit(void);
  /**
   * Parse urls inside text
   * @param pool memory pool
   * @param task task object
   * @param part current text part
   * @param how search mode (RSPAMD_URL_FIND_ALL or RSPAMD_URL_FIND_STRICT)
   */
  void rspamd_url_text_extract(rspamd_mempool_t *pool,
      struct rspamd_task *task,
      struct rspamd_mime_text_part *part,
      enum rspamd_url_find_type how);
  /**
   * Parse a single url into an uri structure
   * @param uri url object, must be pre allocated
   * @param uristring text form of url (taken as non-const;
   *        NOTE(review): presumably may be modified by the parser - confirm)
   * @param len length of `uristring`
   * @param pool memory pool
   * @param flags parsing options, see enum rspamd_url_parse_flags
   * @return URI_ERRNO_OK if parsing went well, another enum uri_errno code otherwise
   */
  enum uri_errno rspamd_url_parse(struct rspamd_url *uri,
      gchar *uristring,
      gsize len,
      rspamd_mempool_t *pool,
      enum rspamd_url_parse_flags flags);
  /**
   * Try to extract url from a text
   * @param pool memory pool
   * @param begin begin of text
   * @param len length of text
   * @param url_str storage for url string (or NULL)
   * @param how search mode, see enum rspamd_url_find_type
   * @param url_pos storage for the position of the url found
   *        (NOTE(review): presumably an offset from `begin`, possibly NULL - confirm)
   * @param prefix_added storage for a boolean
   *        (NOTE(review): presumably set when a scheme prefix was prepended
   *        to the returned string - confirm)
   * @return TRUE if url is found in specified text
   */
  gboolean rspamd_url_find(rspamd_mempool_t *pool,
      const gchar *begin, gsize len,
      gchar **url_str,
      enum rspamd_url_find_type how,
      goffset *url_pos,
      gboolean *prefix_added);
  /**
   * Return text representation of url parsing error
   * @param err error code (NOTE(review): presumably an enum uri_errno value)
   * @return textual description of the error
   */
  const gchar *rspamd_url_strerror(int err);
  /**
   * Find TLD for a specified host string
   * @param in input host
   * @param inlen length of input
   * @param out output rspamd_ftok_t that receives the tld position
   * @return TRUE if tld has been found
   */
  gboolean rspamd_url_find_tld(const gchar *in, gsize inlen, rspamd_ftok_t *out);
  /**
   * Callback type called for each url found by rspamd_url_find_multiple and
   * rspamd_url_find_single
   * @param url url found
   * @param start_offset start of the url
   *        (NOTE(review): presumably an offset within the searched text - confirm)
   * @param end_offset end of the url (same caveat as `start_offset`)
   * @param ud opaque user data supplied by the caller of the search function
   * @return NOTE(review): the meaning of the return value is not visible in this header
   */
  typedef gboolean (*url_insert_function)(struct rspamd_url *url,
      gsize start_offset, gsize end_offset, void *ud);
  /**
   * Search for multiple urls in text and call `func` for each url found
   * @param pool memory pool
   * @param in text to search
   * @param inlen length of `in`
   * @param how search mode, see enum rspamd_url_find_type
   * @param nlines NOTE(review): presumably the positions of line breaks inside
   *        `in`; the element type is not visible in this header - confirm
   * @param func callback called for each url found
   * @param ud opaque user data passed to `func`
   */
  void rspamd_url_find_multiple(rspamd_mempool_t *pool,
      const gchar *in, gsize inlen,
      enum rspamd_url_find_type how,
      GPtrArray *nlines,
      url_insert_function func,
      gpointer ud);
  /**
   * Search for a single url in text and call `func` for each url found
   * @param pool memory pool
   * @param in text to search
   * @param inlen length of `in`
   * @param how search mode, see enum rspamd_url_find_type
   * @param func callback called for the url found
   * @param ud opaque user data passed to `func`
   */
  void rspamd_url_find_single(rspamd_mempool_t *pool,
      const gchar *in, gsize inlen,
      enum rspamd_url_find_type how,
      url_insert_function func,
      gpointer ud);
  /**
   * Generic callback to insert URLs into rspamd_task; its parameter list
   * mirrors url_insert_function
   * @param url url found
   * @param start_offset start of the url
   * @param end_offset end of the url
   * @param ud user data (NOTE(review): presumably the `struct rspamd_task *` to insert into)
   * @return NOTE(review): the meaning of the return value is not visible in this header
   */
  gboolean rspamd_url_task_subject_callback(struct rspamd_url *url,
      gsize start_offset,
      gsize end_offset, gpointer ud);
  /**
   * Decode URL encoded string in-place and return new length of a string,
   * src and dst are NUL terminated
   * @param dst destination buffer
   *        (NOTE(review): "in-place" suggests it may be the same as `src` - confirm)
   * @param src encoded input
   * @param size NOTE(review): presumably the length of `src` - confirm
   * @return length of the decoded string
   */
  gsize rspamd_url_decode(gchar *dst, const gchar *src, gsize size);
  /**
   * Encode url if needed. In this case, memory is allocated from the specific pool.
   * Returns pointer to begin and encoded length in `dlen`
   * @param url url to encode
   * @param dlen output: length of the returned string
   * @param pool memory pool used when an encoded copy has to be allocated
   * @return pointer to the beginning of the (possibly encoded) url text
   */
  const gchar *rspamd_url_encode(struct rspamd_url *url, gsize *dlen,
      rspamd_mempool_t *pool);
  /**
   * Returns if a character is domain character
   * @param c character to test
   * @return TRUE if `c` is a domain character
   */
  gboolean rspamd_url_is_domain(int c);
  /**
   * Returns symbolic name for protocol
   * @param proto protocol value
   * @return name of the protocol
   *         (NOTE(review): result for unlisted values is not visible in this header)
   */
  const gchar *rspamd_url_protocol_name(enum rspamd_url_protocol proto);
  /**
   * Converts string to a numeric protocol
   * @param str protocol name
   * @return matching protocol value
   *         (NOTE(review): presumably PROTOCOL_UNKNOWN when nothing matches - confirm)
   */
  enum rspamd_url_protocol rspamd_url_protocol_from_string(const gchar *str);
  /**
   * Converts string to a url flag
   * @param str flag name
   * @param flag output storage for the flag value
   * @return NOTE(review): presumably true when `str` names a known flag - confirm
   */
  bool rspamd_url_flag_from_string(const gchar *str, gint *flag);
  /**
   * Converts url flag to a string
   * @param flag flag value (NOTE(review): presumably a single enum rspamd_url_flags bit)
   * @return name of the flag
   *         (NOTE(review): result for an unknown value is not visible in this header)
   */
  const gchar *rspamd_url_flag_to_string(int flag);
  /*
   * Declares two khash table types keyed by `struct rspamd_url *` with a
   * `char` value: `rspamd_url_hash` (sets of urls indexed by url as is) and
   * `rspamd_url_host_hash`
   * (NOTE(review): presumably indexed by the url host part - confirm).
   */
  KHASH_DECLARE (rspamd_url_hash, struct rspamd_url *, char);
  KHASH_DECLARE (rspamd_url_host_hash, struct rspamd_url *, char);
  /* Convenience functions for url sets */
  /**
   * Add an url to set or increase the existing url count
   * @param set url set
   * @param u url to add
   * @param enforce_replace NOTE(review): presumably forces `u` to replace an
   *        already stored equal url - confirm against the implementation
   * @return true if a new url has been added
   */
  bool rspamd_url_set_add_or_increase(khash_t (rspamd_url_hash) *set,
      struct rspamd_url *u,
      bool enforce_replace);
  /**
   * Same as rspamd_url_set_add_or_increase but returns the existing url if found
   * @param set url set
   * @param u url to add
   * @return the url already stored in the set if found
   *         (NOTE(review): presumably `u` itself when it has been newly added - confirm)
   */
  struct rspamd_url *rspamd_url_set_add_or_return(khash_t (rspamd_url_hash) *set,
      struct rspamd_url *u);
  /**
   * Helper for url host set
   * @param set url host set
   * @param u url to add
   * @return NOTE(review): presumably true if the url has been newly added - confirm
   */
  bool rspamd_url_host_set_add(khash_t (rspamd_url_host_hash) *set,
      struct rspamd_url *u);
  /**
   * Checks if a url is in set
   * @param set url set
   * @param u url to look for
   * @return true if `u` is in `set`
   */
  bool rspamd_url_set_has(khash_t (rspamd_url_hash) *set, struct rspamd_url *u);
  /**
   * Same check for a url host set
   * @param set url host set
   * @param u url to look for
   * @return true if `u` is in `set`
   */
  bool rspamd_url_host_set_has(khash_t (rspamd_url_host_hash) *set, struct rspamd_url *u);
  /**
   * Compares two urls (similar to C comparison functions) lexicographically
   * @param u1 first url
   * @param u2 second url
   * @return NOTE(review): presumably negative, zero or positive in the manner
   *         of strcmp/memcmp - confirm
   */
  int rspamd_url_cmp(const struct rspamd_url *u1, const struct rspamd_url *u2);
  /**
   * Same but used for qsort to sort `struct rspamd_url *[]` array
   * @param u1 pointer to the first array element (i.e. to a `struct rspamd_url *`)
   * @param u2 pointer to the second array element
   * @return comparison result in the form expected by qsort
   */
  int rspamd_url_cmp_qsort(const void *u1, const void *u2);
  /**
   * Run rspamd_normalise_unicode_inplace on the input and translate its result
   * bits into url flags:
   *   RSPAMD_UNICODE_NORM_UNNORMAL    -> RSPAMD_URL_FLAG_UNNORMALISED
   *   RSPAMD_UNICODE_NORM_ZERO_SPACES -> RSPAMD_URL_FLAG_ZW_SPACES
   *   RSPAMD_UNICODE_NORM_ERROR       -> RSPAMD_URL_FLAG_OBSCURED
   * Flags are only ever OR-ed into `url_flags_out`, never cleared.
   * @param pool memory pool (not referenced by the expansion)
   * @param input string to normalise in place
   * @param len_out length storage (must be &var)
   * @param url_flags_out flags lvalue (must be just a var with no dereference)
   */
  #define rspamd_url_normalise_propagate_flags(pool, input, len_out, url_flags_out) \
    do { \
      enum rspamd_normalise_result norm_res = \
          rspamd_normalise_unicode_inplace((input), (len_out)); \
      if ((norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) != 0) { \
        url_flags_out |= RSPAMD_URL_FLAG_UNNORMALISED; \
      } \
      if ((norm_res & RSPAMD_UNICODE_NORM_ZERO_SPACES) != 0) { \
        url_flags_out |= RSPAMD_URL_FLAG_ZW_SPACES; \
      } \
      if ((norm_res & (RSPAMD_UNICODE_NORM_ERROR)) != 0) { \
        url_flags_out |= RSPAMD_URL_FLAG_OBSCURED; \
      } \
    } while (0)
  327. #ifdef __cplusplus
  328. }
  329. #endif
  330. #endif