You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

url.h 6.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277
  1. /* URL check functions */
  2. #ifndef URL_H
  3. #define URL_H
  4. #include "config.h"
  5. #include "mem_pool.h"
  6. #include "fstring.h"
  7. #ifdef __cplusplus
  8. extern "C" {
  9. #endif
  10. struct rspamd_task;
  11. struct rspamd_mime_text_part;
  /**
   * Bit flags attached to a URL (stored in the `flags` member of
   * struct rspamd_url). Every constant occupies its own bit (0..18),
   * so several flags can be OR-ed together.
   */
  enum rspamd_url_flags {
      RSPAMD_URL_FLAG_PHISHED        = 0x00001u, /* bit 0 */
      RSPAMD_URL_FLAG_NUMERIC        = 0x00002u, /* bit 1 */
      RSPAMD_URL_FLAG_OBSCURED       = 0x00004u, /* bit 2 */
      RSPAMD_URL_FLAG_REDIRECTED     = 0x00008u, /* bit 3 */
      RSPAMD_URL_FLAG_HTML_DISPLAYED = 0x00010u, /* bit 4 */
      RSPAMD_URL_FLAG_FROM_TEXT      = 0x00020u, /* bit 5 */
      RSPAMD_URL_FLAG_SUBJECT        = 0x00040u, /* bit 6 */
      RSPAMD_URL_FLAG_HOSTENCODED    = 0x00080u, /* bit 7 */
      RSPAMD_URL_FLAG_SCHEMAENCODED  = 0x00100u, /* bit 8 */
      RSPAMD_URL_FLAG_PATHENCODED    = 0x00200u, /* bit 9 */
      RSPAMD_URL_FLAG_QUERYENCODED   = 0x00400u, /* bit 10 */
      RSPAMD_URL_FLAG_MISSINGSLASHES = 0x00800u, /* bit 11 */
      RSPAMD_URL_FLAG_IDN            = 0x01000u, /* bit 12 */
      RSPAMD_URL_FLAG_HAS_PORT       = 0x02000u, /* bit 13 */
      RSPAMD_URL_FLAG_HAS_USER       = 0x04000u, /* bit 14 */
      RSPAMD_URL_FLAG_SCHEMALESS     = 0x08000u, /* bit 15 */
      RSPAMD_URL_FLAG_UNNORMALISED   = 0x10000u, /* bit 16 */
      RSPAMD_URL_FLAG_ZW_SPACES      = 0x20000u, /* bit 17 */
      RSPAMD_URL_FLAG_DISPLAY_URL    = 0x40000u, /* bit 18 */
  };
  33. struct rspamd_url_tag {
  34. const gchar *data;
  35. struct rspamd_url_tag *prev, *next;
  36. };
  37. struct rspamd_url {
  38. gchar *raw;
  39. gchar *string;
  40. guint protocol;
  41. guint port;
  42. gchar *user;
  43. gchar *host;
  44. gchar *data;
  45. gchar *query;
  46. gchar *fragment;
  47. gchar *surbl;
  48. gchar *tld;
  49. gchar *visible_part;
  50. struct rspamd_url *phished_url;
  51. guint protocollen;
  52. guint userlen;
  53. guint hostlen;
  54. guint datalen;
  55. guint querylen;
  56. guint fragmentlen;
  57. guint surbllen;
  58. guint tldlen;
  59. guint urllen;
  60. guint rawlen;
  61. enum rspamd_url_flags flags;
  62. guint count;
  63. };
  /**
   * Result codes returned by rspamd_url_parse (); zero means success.
   */
  enum uri_errno {
      URI_ERRNO_OK               = 0, /* Parsing went well */
      URI_ERRNO_EMPTY            = 1, /* The URI string was empty */
      URI_ERRNO_INVALID_PROTOCOL = 2, /* No protocol was found */
      URI_ERRNO_INVALID_PORT     = 3, /* Port number is bad */
      URI_ERRNO_BAD_ENCODING     = 4, /* Bad characters encoding */
      URI_ERRNO_BAD_FORMAT       = 5,
      URI_ERRNO_TLD_MISSING      = 6,
      URI_ERRNO_HOST_MISSING     = 7
  };
  /**
   * URL protocols, one bit per protocol so that values can be combined
   * into a mask.
   *
   * NOTE(review): PROTOCOL_UNKNOWN (bit 31) is not representable as `int`.
   * ISO C before C23 requires enumerator values to fit in `int`, so this
   * relies on a compiler extension. The value is kept as is because
   * changing it would alter a public constant.
   */
  enum rspamd_url_protocol {
      PROTOCOL_FILE      = 0x00000001u, /* bit 0 */
      PROTOCOL_FTP       = 0x00000002u, /* bit 1 */
      PROTOCOL_HTTP      = 0x00000004u, /* bit 2 */
      PROTOCOL_HTTPS     = 0x00000008u, /* bit 3 */
      PROTOCOL_MAILTO    = 0x00000010u, /* bit 4 */
      PROTOCOL_TELEPHONE = 0x00000020u, /* bit 5 */
      PROTOCOL_UNKNOWN   = 0x80000000u, /* bit 31 */
  };
  /**
   * Flags accepted by rspamd_url_parse (). RSPAMD_URL_PARSE_TEXT is the
   * zero value (no bits set); the remaining constants are single bits.
   */
  enum rspamd_url_parse_flags {
      RSPAMD_URL_PARSE_TEXT  = 0x0u,
      RSPAMD_URL_PARSE_HREF  = 0x1u, /* bit 0 */
      RSPAMD_URL_PARSE_CHECK = 0x2u, /* bit 1 */
  };
  /**
   * Search mode passed as the `how` argument of the rspamd_url_find* and
   * rspamd_url_text_extract functions.
   */
  enum rspamd_url_find_type {
      RSPAMD_URL_FIND_ALL    = 0,
      RSPAMD_URL_FIND_STRICT = 1,
  };
  92. /**
  93. * Initialize url library
  94. * @param cfg
  95. */
  96. void rspamd_url_init (const gchar *tld_file);
  /**
   * Counterpart of rspamd_url_init ().
   * NOTE(review): presumably releases what rspamd_url_init () set up -
   * confirm against the implementation.
   */
  void
  rspamd_url_deinit (void);
  98. /*
  99. * Parse urls inside text
  100. * @param pool memory pool
  101. * @param task task object
  102. * @param part current text part
  103. * @param is_html turn on html euristic
  104. */
  105. void rspamd_url_text_extract (rspamd_mempool_t *pool,
  106. struct rspamd_task *task,
  107. struct rspamd_mime_text_part *part,
  108. enum rspamd_url_find_type how);
  109. /*
  110. * Parse a single url into an uri structure
  111. * @param pool memory pool
  112. * @param uristring text form of url
  113. * @param uri url object, must be pre allocated
  114. */
  115. enum uri_errno rspamd_url_parse (struct rspamd_url *uri,
  116. gchar *uristring,
  117. gsize len,
  118. rspamd_mempool_t *pool,
  119. enum rspamd_url_parse_flags flags);
  120. /*
  121. * Try to extract url from a text
  122. * @param pool memory pool
  123. * @param begin begin of text
  124. * @param len length of text
  125. * @param start storage for start position of url found (or NULL)
  126. * @param end storage for end position of url found (or NULL)
  127. * @param url_str storage for url string(or NULL)
  128. * @return TRUE if url is found in specified text
  129. */
  130. gboolean rspamd_url_find (rspamd_mempool_t *pool,
  131. const gchar *begin, gsize len,
  132. gchar **url_str,
  133. enum rspamd_url_find_type how,
  134. goffset *url_pos,
  135. gboolean *prefix_added);
  136. /*
  137. * Return text representation of url parsing error
  138. */
  139. const gchar *rspamd_url_strerror (int err);
  140. /**
  141. * Find TLD for a specified host string
  142. * @param in input host
  143. * @param inlen length of input
  144. * @param out output rspamd_ftok_t with tld position
  145. * @return TRUE if tld has been found
  146. */
  147. gboolean rspamd_url_find_tld (const gchar *in, gsize inlen, rspamd_ftok_t *out);
  148. typedef void (*url_insert_function) (struct rspamd_url *url,
  149. gsize start_offset, gsize end_offset, void *ud);
  150. /**
  151. * Search for multiple urls in text and call `func` for each url found
  152. * @param pool
  153. * @param in
  154. * @param inlen
  155. * @param is_html
  156. * @param func
  157. * @param ud
  158. */
  159. void rspamd_url_find_multiple (rspamd_mempool_t *pool,
  160. const gchar *in, gsize inlen,
  161. enum rspamd_url_find_type how,
  162. GPtrArray *nlines,
  163. url_insert_function func,
  164. gpointer ud);
  165. /**
  166. * Search for a single url in text and call `func` for each url found
  167. * @param pool
  168. * @param in
  169. * @param inlen
  170. * @param is_html
  171. * @param func
  172. * @param ud
  173. */
  174. void rspamd_url_find_single (rspamd_mempool_t *pool,
  175. const gchar *in, gsize inlen,
  176. enum rspamd_url_find_type how,
  177. url_insert_function func,
  178. gpointer ud);
  179. /**
  180. * Generic callback to insert URLs into rspamd_task
  181. * @param url
  182. * @param start_offset
  183. * @param end_offset
  184. * @param ud
  185. */
  186. void rspamd_url_task_subject_callback (struct rspamd_url *url,
  187. gsize start_offset,
  188. gsize end_offset, gpointer ud);
  189. guint rspamd_url_hash (gconstpointer u);
  190. guint rspamd_email_hash (gconstpointer u);
  191. guint rspamd_url_host_hash (gconstpointer u);
  192. /* Compare two emails for building emails hash */
  193. gboolean rspamd_emails_cmp (gconstpointer a, gconstpointer b);
  194. /* Compare two urls for building emails hash */
  195. gboolean rspamd_urls_cmp (gconstpointer a, gconstpointer b);
  196. gboolean rspamd_urls_host_cmp (gconstpointer a, gconstpointer b);
  197. /**
  198. * Decode URL encoded string in-place and return new length of a string, src and dst are NULL terminated
  199. * @param dst
  200. * @param src
  201. * @param size
  202. * @return
  203. */
  204. gsize rspamd_url_decode (gchar *dst, const gchar *src, gsize size);
  205. /**
  206. * Encode url if needed. In this case, memory is allocated from the specific pool.
  207. * Returns pointer to begin and encoded length in `dlen`
  208. * @param url
  209. * @param pool
  210. * @return
  211. */
  212. const gchar *rspamd_url_encode (struct rspamd_url *url, gsize *dlen,
  213. rspamd_mempool_t *pool);
  214. /**
  215. * Returns if a character is domain character
  216. * @param c
  217. * @return
  218. */
  219. gboolean rspamd_url_is_domain (int c);
  220. /**
  221. * Returns symbolic name for protocol
  222. * @param proto
  223. * @return
  224. */
  225. const gchar *rspamd_url_protocol_name (enum rspamd_url_protocol proto);
  226. /**
  227. * Converts string to a numeric protocol
  228. * @param str
  229. * @return
  230. */
  231. enum rspamd_url_protocol rspamd_url_protocol_from_string (const gchar *str);
  232. #ifdef __cplusplus
  233. }
  234. #endif
  235. #endif