You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

url.h 8.6KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347
  1. /* URL check functions */
  2. #ifndef URL_H
  3. #define URL_H
  4. #include "config.h"
  5. #include "mem_pool.h"
  6. #include "khash.h"
  7. #include "fstring.h"
  8. #ifdef __cplusplus
  9. extern "C" {
  10. #endif
  11. struct rspamd_task;
  12. struct rspamd_mime_text_part;
  /*
   * Bit flags describing a URL. Each enumerator owns exactly one bit
   * (bits 0 through 22), so several flags can be OR-ed into one mask.
   * NOTE(review): presumably stored in `struct rspamd_url::flags` - confirm.
   */
  enum rspamd_url_flags {
      RSPAMD_URL_FLAG_PHISHED        = (1u << 0),  /* bit 0 */
      RSPAMD_URL_FLAG_NUMERIC        = (1u << 1),  /* bit 1 */
      RSPAMD_URL_FLAG_OBSCURED       = (1u << 2),  /* bit 2 */
      RSPAMD_URL_FLAG_REDIRECTED     = (1u << 3),  /* bit 3 */
      RSPAMD_URL_FLAG_HTML_DISPLAYED = (1u << 4),  /* bit 4 */
      RSPAMD_URL_FLAG_FROM_TEXT      = (1u << 5),  /* bit 5 */
      RSPAMD_URL_FLAG_SUBJECT        = (1u << 6),  /* bit 6 */
      RSPAMD_URL_FLAG_HOSTENCODED    = (1u << 7),  /* bit 7 */
      RSPAMD_URL_FLAG_SCHEMAENCODED  = (1u << 8),  /* bit 8 */
      RSPAMD_URL_FLAG_PATHENCODED    = (1u << 9),  /* bit 9 */
      RSPAMD_URL_FLAG_QUERYENCODED   = (1u << 10), /* bit 10 */
      RSPAMD_URL_FLAG_MISSINGSLASHES = (1u << 11), /* bit 11 */
      RSPAMD_URL_FLAG_IDN            = (1u << 12), /* bit 12 */
      RSPAMD_URL_FLAG_HAS_PORT       = (1u << 13), /* bit 13 */
      RSPAMD_URL_FLAG_HAS_USER       = (1u << 14), /* bit 14 */
      RSPAMD_URL_FLAG_SCHEMALESS     = (1u << 15), /* bit 15 */
      RSPAMD_URL_FLAG_UNNORMALISED   = (1u << 16), /* bit 16 */
      RSPAMD_URL_FLAG_ZW_SPACES      = (1u << 17), /* bit 17 */
      RSPAMD_URL_FLAG_DISPLAY_URL    = (1u << 18), /* bit 18 */
      RSPAMD_URL_FLAG_IMAGE          = (1u << 19), /* bit 19 */
      RSPAMD_URL_FLAG_QUERY          = (1u << 20), /* bit 20 */
      RSPAMD_URL_FLAG_CONTENT        = (1u << 21), /* bit 21 */
      RSPAMD_URL_FLAG_NO_TLD         = (1u << 22), /* bit 22 */
  };
  38. struct rspamd_url_tag {
  39. const gchar *data;
  40. struct rspamd_url_tag *prev, *next;
  41. };
  /*
   * Parsed URL. Components are not stored as separate strings: each one is
   * described by an offset ("shift") into `string` plus a length, which is
   * how the rspamd_url_*() accessor macros in this header address them.
   */
  struct rspamd_url {
      gchar *raw;              /* NOTE(review): presumably the URL as found in the input; length in `rawlen` - confirm */
      gchar *string;           /* buffer that all *shift offsets below are relative to */
      guint16 protocol;        /* NOTE(review): presumably an enum rspamd_url_protocol value - confirm */
      guint16 port;
      guint usershift;         /* offset of the user part inside `string` */
      guint hostshift;         /* offset of the host part inside `string` */
      guint datashift;         /* offset of the data part inside `string` */
      guint queryshift;        /* offset of the query part inside `string` */
      guint fragmentshift;     /* offset of the fragment part inside `string` */
      guint tldshift;          /* offset of the TLD part inside `string` */
      guint16 protocollen;
      guint16 userlen;         /* 0 means "no user part" for rspamd_url_user() */
      guint16 hostlen;         /* 0 means "no host part" for rspamd_url_host() */
      guint16 datalen;
      guint16 querylen;
      guint16 fragmentlen;
      guint16 tldlen;
      guint16 count;           /* NOTE(review): presumably the occurrence counter bumped by rspamd_url_set_add_or_increase - confirm */
      guint urllen;            /* NOTE(review): presumably the length of `string` - confirm */
      guint rawlen;            /* NOTE(review): presumably the length of `raw` - confirm */
      guint32 flags;           /* NOTE(review): presumably a mask of enum rspamd_url_flags bits - confirm */
      gchar *visible_part;
      struct rspamd_url *phished_url;
  };
  /*
   * Unchecked accessors: pointer to the first character of a component,
   * computed as (u)->string plus the component offset. No length check is
   * performed. The argument `u` is evaluated more than once.
   */
  #define rspamd_url_user_unsafe(u) ((u)->string + (u)->usershift)
  #define rspamd_url_host_unsafe(u) ((u)->string + (u)->hostshift)
  #define rspamd_url_tld_unsafe(u) ((u)->string + (u)->tldshift)
  #define rspamd_url_data_unsafe(u) ((u)->string + (u)->datashift)
  #define rspamd_url_query_unsafe(u) ((u)->string + (u)->queryshift)
  #define rspamd_url_fragment_unsafe(u) ((u)->string + (u)->fragmentshift)

  /*
   * Checked accessors: same pointer as the unchecked variant when the
   * component length is non-zero, NULL otherwise.
   */
  #define rspamd_url_user(u) ((u)->userlen > 0 ? rspamd_url_user_unsafe(u) : NULL)
  #define rspamd_url_host(u) ((u)->hostlen > 0 ? rspamd_url_host_unsafe(u) : NULL)
  /*
   * Result codes of URL parsing. Zero means success; every other value
   * names one failure reason.
   */
  enum uri_errno {
      /* Parsing went well */
      URI_ERRNO_OK = 0,
      /* The URI string was empty */
      URI_ERRNO_EMPTY = 1,
      /* No protocol was found */
      URI_ERRNO_INVALID_PROTOCOL = 2,
      /* Port number is bad */
      URI_ERRNO_INVALID_PORT = 3,
      /* Bad characters encoding */
      URI_ERRNO_BAD_ENCODING = 4,
      URI_ERRNO_BAD_FORMAT = 5,
      URI_ERRNO_TLD_MISSING = 6,
      URI_ERRNO_HOST_MISSING = 7
  };
  /*
   * URL protocols. Each protocol is a distinct bit, so a set of protocols
   * can be expressed as a mask; PROTOCOL_UNKNOWN takes bit 15.
   */
  enum rspamd_url_protocol {
      PROTOCOL_FILE      = (1u << 0),
      PROTOCOL_FTP       = (1u << 1),
      PROTOCOL_HTTP      = (1u << 2),
      PROTOCOL_HTTPS     = (1u << 3),
      PROTOCOL_MAILTO    = (1u << 4),
      PROTOCOL_TELEPHONE = (1u << 5),
      PROTOCOL_UNKNOWN   = (1u << 15),
  };
  /*
   * Flags accepted by rspamd_url_parse. RSPAMD_URL_PARSE_TEXT is the
   * all-bits-clear value; the remaining enumerators are single bits.
   */
  enum rspamd_url_parse_flags {
      RSPAMD_URL_PARSE_TEXT  = 0u,
      RSPAMD_URL_PARSE_HREF  = 1u << 0,
      RSPAMD_URL_PARSE_CHECK = 1u << 1,
  };
  /* Search mode passed as `how` to the rspamd_url_find* functions. */
  enum rspamd_url_find_type {
      RSPAMD_URL_FIND_ALL    = 0,
      RSPAMD_URL_FIND_STRICT = 1,
  };
  /**
   * Initialize url library
   * @param tld_file NOTE(review): presumably the path of a TLD list file;
   *        accepted format and NULL handling are not visible in this header
   */
  void rspamd_url_init (const gchar *tld_file);
  /**
   * Counterpart of rspamd_url_init.
   * NOTE(review): presumably releases the state created by rspamd_url_init - confirm
   */
  void rspamd_url_deinit (void);
  /**
   * Parse urls inside text
   * @param pool memory pool
   * @param task task object
   * @param part current text part
   * @param how search mode, one of enum rspamd_url_find_type
   */
  void rspamd_url_text_extract (rspamd_mempool_t *pool,
          struct rspamd_task *task,
          struct rspamd_mime_text_part *part,
          enum rspamd_url_find_type how);
  /**
   * Parse a single url into an uri structure
   * @param uri url object, must be pre allocated
   * @param uristring text form of url
   * @param len length of `uristring`
   * @param pool memory pool
   * @param flags parsing flags, see enum rspamd_url_parse_flags
   * @return an enum uri_errno code; URI_ERRNO_OK means parsing went well
   */
  enum uri_errno rspamd_url_parse (struct rspamd_url *uri,
          gchar *uristring,
          gsize len,
          rspamd_mempool_t *pool,
          enum rspamd_url_parse_flags flags);
  /**
   * Try to extract url from a text
   * @param pool memory pool
   * @param begin begin of text
   * @param len length of text
   * @param url_str storage for url string (or NULL)
   * @param how search mode, one of enum rspamd_url_find_type
   * @param url_pos storage for the position of the url found
   *        NOTE(review): presumably an offset relative to `begin`, and
   *        presumably optional (NULL) - confirm in the implementation
   * @param prefix_added output flag
   *        NOTE(review): presumably set when a prefix was prepended to the
   *        returned url string - confirm in the implementation
   * @return TRUE if url is found in specified text
   */
  gboolean rspamd_url_find (rspamd_mempool_t *pool,
          const gchar *begin, gsize len,
          gchar **url_str,
          enum rspamd_url_find_type how,
          goffset *url_pos,
          gboolean *prefix_added);
  /**
   * Return text representation of url parsing error
   * @param err error code (presumably an enum uri_errno value - confirm)
   * @return constant string describing the error
   */
  const gchar *rspamd_url_strerror (int err);
  /**
   * Find TLD for a specified host string
   * @param in input host
   * @param inlen length of input
   * @param out output rspamd_ftok_t with tld position
   *        NOTE(review): presumably refers into `in` rather than a copy - confirm
   * @return TRUE if tld has been found
   */
  gboolean rspamd_url_find_tld (const gchar *in, gsize inlen, rspamd_ftok_t *out);
  /**
   * Callback type taken as `func` by rspamd_url_find_multiple and
   * rspamd_url_find_single; it is called for each url found.
   * @param url url that has been found
   * @param start_offset NOTE(review): presumably start of the url in the searched text - confirm
   * @param end_offset NOTE(review): presumably end of the url in the searched text - confirm
   * @param ud opaque user data passed through from the caller
   * @return NOTE(review): meaning of the boolean result is not visible in this header
   */
  typedef gboolean (*url_insert_function) (struct rspamd_url *url,
          gsize start_offset, gsize end_offset, void *ud);
  /**
   * Search for multiple urls in text and call `func` for each url found
   * @param pool memory pool
   * @param in text to search
   * @param inlen length of `in`
   * @param how search mode, one of enum rspamd_url_find_type
   * @param nlines NOTE(review): presumably an array of newline positions in
   *        `in`; element type and whether NULL is allowed are not visible here
   * @param func callback invoked for each url found
   * @param ud opaque user data passed to `func`
   */
  void rspamd_url_find_multiple (rspamd_mempool_t *pool,
          const gchar *in, gsize inlen,
          enum rspamd_url_find_type how,
          GPtrArray *nlines,
          url_insert_function func,
          gpointer ud);
  /**
   * Search for a single url in text and call `func` for the url found
   * @param pool memory pool
   * @param in text to search
   * @param inlen length of `in`
   * @param how search mode, one of enum rspamd_url_find_type
   * @param func callback invoked for the url found
   * @param ud opaque user data passed to `func`
   */
  void rspamd_url_find_single (rspamd_mempool_t *pool,
          const gchar *in, gsize inlen,
          enum rspamd_url_find_type how,
          url_insert_function func,
          gpointer ud);
  /**
   * Generic callback to insert URLs into rspamd_task. Its parameter list
   * matches url_insert_function, so it can be passed as `func` to the
   * rspamd_url_find_* functions.
   * @param url url that has been found
   * @param start_offset offset where the url starts
   * @param end_offset offset where the url ends
   * @param ud user data (NOTE(review): presumably a struct rspamd_task * - confirm)
   * @return NOTE(review): meaning of the boolean result is not visible in this header
   */
  gboolean rspamd_url_task_subject_callback (struct rspamd_url *url,
          gsize start_offset,
          gsize end_offset, gpointer ud);
  /**
   * Decode URL encoded string in-place and return new length of a string, src and dst are NULL terminated
   * @param dst destination buffer (per the description above it may be the
   *        same buffer as `src`)
   * @param src URL encoded input
   * @param size NOTE(review): presumably the number of input bytes in `src` - confirm
   * @return new (decoded) length of the string
   */
  gsize rspamd_url_decode (gchar *dst, const gchar *src, gsize size);
  /**
   * Encode url if needed. In this case, memory is allocated from the specific pool.
   * Returns pointer to begin and encoded length in `dlen`
   * @param url url to encode
   * @param dlen output: length of the returned string
   * @param pool memory pool used when a new encoded copy has to be allocated
   * @return pointer to the beginning of the (possibly encoded) url text
   */
  const gchar *rspamd_url_encode (struct rspamd_url *url, gsize *dlen,
          rspamd_mempool_t *pool);
  /**
   * Returns if a character is domain character
   * @param c character to test
   * @return TRUE if `c` is a domain character
   */
  gboolean rspamd_url_is_domain (int c);
  /**
   * Returns symbolic name for protocol
   * @param proto protocol value (enum rspamd_url_protocol)
   * @return constant string with the protocol name
   */
  const gchar *rspamd_url_protocol_name (enum rspamd_url_protocol proto);
  /**
   * Converts string to a numeric protocol
   * @param str protocol name
   * @return matching enum rspamd_url_protocol value
   *         NOTE(review): presumably PROTOCOL_UNKNOWN for unrecognised names - confirm
   */
  enum rspamd_url_protocol rspamd_url_protocol_from_string (const gchar *str);
  /**
   * Converts string to a url flag
   * @param str flag name
   * @param flag output: storage for the flag value
   *        (NOTE(review): presumably an enum rspamd_url_flags bit - confirm)
   * @return NOTE(review): presumably true when `str` names a known flag - confirm
   */
  bool rspamd_url_flag_from_string (const gchar *str, gint *flag);
  /**
   * Converts url flag to a string
   * @param flag flag value (NOTE(review): presumably a single enum rspamd_url_flags bit - confirm)
   * @return constant string naming the flag
   *         (NOTE(review): result for an unknown flag is not visible here)
   */
  const gchar * rspamd_url_flag_to_string (int flag);
  /*
   * Defines sets of urls indexed by url as is.
   * Both tables are keyed by `struct rspamd_url *` with a `char` value type.
   * NOTE(review): rspamd_url_host_hash is presumably keyed by the host part
   * of the url rather than the whole url - confirm at the definition site.
   */
  KHASH_DECLARE (rspamd_url_hash, struct rspamd_url *, char);
  KHASH_DECLARE (rspamd_url_host_hash, struct rspamd_url *, char);
  251. /* Convenience functions for url sets */
  /**
   * Add an url to set or increase the existing url count
   * @param set url set to modify
   * @param u url to add
   * @param enforce_replace NOTE(review): presumably forces the stored entry
   *        to be replaced by `u` when an equal url already exists - confirm
   * @return true if a new url has been added
   */
  bool rspamd_url_set_add_or_increase(khash_t (rspamd_url_hash) *set,
          struct rspamd_url *u,
          bool enforce_replace);
  /**
   * Same as rspamd_url_set_add_or_increase but returns the existing url if found
   * @param set url set to modify
   * @param u url to add
   * @return the url already present in the set if found
   *         (NOTE(review): presumably `u` itself when it was newly added - confirm)
   */
  struct rspamd_url * rspamd_url_set_add_or_return (khash_t (rspamd_url_hash) *set,
          struct rspamd_url *u);
  /**
   * Helper for url host set
   * @param set host set to modify
   * @param u url to add
   * @return NOTE(review): presumably true if a new entry has been added - confirm
   */
  bool rspamd_url_host_set_add (khash_t (rspamd_url_host_hash) *set,
          struct rspamd_url *u);
  /**
   * Checks if a url is in set. The second function is the same check
   * for a url host set.
   * @param set set to query
   * @param u url to look up
   * @return true if `u` is in `set`
   */
  bool rspamd_url_set_has (khash_t (rspamd_url_hash) *set, struct rspamd_url *u);
  bool rspamd_url_host_set_has (khash_t (rspamd_url_host_hash) *set, struct rspamd_url *u);
  /**
   * Compares two urls (similar to C comparison functions) lexicographically
   * @param u1 first url
   * @param u2 second url
   * @return NOTE(review): presumably negative, zero or positive in the manner
   *         of strcmp/memcmp - confirm in the implementation
   */
  int rspamd_url_cmp (const struct rspamd_url *u1, const struct rspamd_url *u2);
  /**
   * Same but used for qsort to sort `struct rspamd_url *[]` array
   * @param u1 pointer to an array element, i.e. a `struct rspamd_url **`
   * @param u2 pointer to an array element, i.e. a `struct rspamd_url **`
   * @return ordering of the two urls in the form qsort expects
   */
  int rspamd_url_cmp_qsort (const void *u1, const void *u2);
  299. #ifdef __cplusplus
  300. }
  301. #endif
  302. #endif