Nevar pievienot vairāk nekā 25 tēmas. Tēmai ir jāsākas ar burtu vai ciparu, tā var saturēt domuzīmes ('-') un var būt līdz 35 simboliem gara.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243
  1. /* URL check functions */
  2. #ifndef URL_H
  3. #define URL_H
  4. #include "config.h"
  5. #include "mem_pool.h"
  6. #include "fstring.h"
  /* Forward declarations: this header only uses pointers to these types. */
  struct rspamd_task;
  struct rspamd_mime_text_part;
  /**
   * Bit flags describing properties of a URL.
   *
   * Every value is a distinct power of two, so several flags can be combined
   * with bitwise OR; `struct rspamd_url` carries them in its `flags` field.
   */
  enum rspamd_url_flags {
      RSPAMD_URL_FLAG_PHISHED        = 1 << 0,
      RSPAMD_URL_FLAG_NUMERIC        = 1 << 1,
      RSPAMD_URL_FLAG_OBSCURED       = 1 << 2,
      RSPAMD_URL_FLAG_REDIRECTED     = 1 << 3,
      RSPAMD_URL_FLAG_HTML_DISPLAYED = 1 << 4,
      RSPAMD_URL_FLAG_FROM_TEXT      = 1 << 5,
      RSPAMD_URL_FLAG_SUBJECT        = 1 << 6,
      RSPAMD_URL_FLAG_HOSTENCODED    = 1 << 7,
      RSPAMD_URL_FLAG_SCHEMAENCODED  = 1 << 8,
      RSPAMD_URL_FLAG_PATHENCODED    = 1 << 9,
      RSPAMD_URL_FLAG_QUERYENCODED   = 1 << 10,
      RSPAMD_URL_FLAG_MISSINGSLASHES = 1 << 11,
      RSPAMD_URL_FLAG_IDN            = 1 << 12,
      RSPAMD_URL_FLAG_HAS_PORT       = 1 << 13,
      RSPAMD_URL_FLAG_HAS_USER       = 1 << 14,
      RSPAMD_URL_FLAG_SCHEMALESS     = 1 << 15,
      RSPAMD_URL_FLAG_UNNORMALISED   = 1 << 16,
      RSPAMD_URL_FLAG_ZW_SPACES      = 1 << 17,
  };
  29. struct rspamd_url_tag {
  30. const gchar *data;
  31. struct rspamd_url_tag *prev, *next;
  32. };
  33. struct rspamd_url {
  34. gchar *raw;
  35. gchar *string;
  36. gint protocol;
  37. guint port;
  38. gchar *user;
  39. gchar *host;
  40. gchar *data;
  41. gchar *query;
  42. gchar *fragment;
  43. gchar *surbl;
  44. gchar *tld;
  45. struct rspamd_url *phished_url;
  46. guint protocollen;
  47. guint userlen;
  48. guint hostlen;
  49. guint datalen;
  50. guint querylen;
  51. guint fragmentlen;
  52. guint surbllen;
  53. guint tldlen;
  54. guint urllen;
  55. guint rawlen;
  56. enum rspamd_url_flags flags;
  57. guint count;
  58. GHashTable *tags;
  59. };
  /**
   * Result codes of rspamd_url_parse; rspamd_url_strerror takes one of these
   * values and returns its text representation.
   */
  enum uri_errno {
      URI_ERRNO_OK = 0,           /* Parsing went well */
      URI_ERRNO_EMPTY,            /* The URI string was empty */
      URI_ERRNO_INVALID_PROTOCOL, /* No protocol was found */
      URI_ERRNO_INVALID_PORT,     /* Port number is bad */
      URI_ERRNO_BAD_ENCODING,     /* Bad characters encoding */
      URI_ERRNO_BAD_FORMAT,
      URI_ERRNO_TLD_MISSING,
      URI_ERRNO_HOST_MISSING
  };
  /**
   * URL protocols (schemes) known to the library.
   *
   * Values are sequential starting at 0; PROTOCOL_UNKNOWN is the last one.
   */
  enum rspamd_url_protocol {
      PROTOCOL_FILE = 0,
      PROTOCOL_FTP,
      PROTOCOL_HTTP,
      PROTOCOL_HTTPS,
      PROTOCOL_MAILTO,
      PROTOCOL_TELEPHONE,
      PROTOCOL_UNKNOWN
  };
  79. /**
  80. * Initialize url library
  81. * @param cfg
  82. */
  83. void rspamd_url_init (const gchar *tld_file);
  84. void rspamd_url_deinit (void);
  85. /*
  86. * Parse urls inside text
  87. * @param pool memory pool
  88. * @param task task object
  89. * @param part current text part
  90. * @param is_html turn on html euristic
  91. */
  92. void rspamd_url_text_extract (rspamd_mempool_t *pool,
  93. struct rspamd_task *task,
  94. struct rspamd_mime_text_part *part,
  95. gboolean is_html);
  /**
   * Flags accepted by rspamd_url_parse.
   *
   * RSPAMD_URL_PARSE_TEXT is zero (no bits set); the other values are single
   * bits. All shifts use an unsigned operand for consistency.
   */
  enum rspamd_url_parse_flags {
      RSPAMD_URL_PARSE_TEXT  = 0,
      RSPAMD_URL_PARSE_HREF  = (1u << 0),
      RSPAMD_URL_PARSE_CHECK = (1u << 1),
  };
  101. /*
  102. * Parse a single url into an uri structure
  103. * @param pool memory pool
  104. * @param uristring text form of url
  105. * @param uri url object, must be pre allocated
  106. */
  107. enum uri_errno rspamd_url_parse (struct rspamd_url *uri,
  108. gchar *uristring,
  109. gsize len,
  110. rspamd_mempool_t *pool,
  111. enum rspamd_url_parse_flags flags);
  112. /*
  113. * Try to extract url from a text
  114. * @param pool memory pool
  115. * @param begin begin of text
  116. * @param len length of text
  117. * @param start storage for start position of url found (or NULL)
  118. * @param end storage for end position of url found (or NULL)
  119. * @param url_str storage for url string(or NULL)
  120. * @return TRUE if url is found in specified text
  121. */
  122. gboolean rspamd_url_find (rspamd_mempool_t *pool, const gchar *begin, gsize len,
  123. gchar **url_str, gboolean is_html, goffset *url_pos,
  124. gboolean *prefix_added);
  125. /*
  126. * Return text representation of url parsing error
  127. */
  128. const gchar * rspamd_url_strerror (enum uri_errno err);
  129. /**
  130. * Find TLD for a specified host string
  131. * @param in input host
  132. * @param inlen length of input
  133. * @param out output rspamd_ftok_t with tld position
  134. * @return TRUE if tld has been found
  135. */
  136. gboolean rspamd_url_find_tld (const gchar *in, gsize inlen, rspamd_ftok_t *out);
  137. typedef void (*url_insert_function) (struct rspamd_url *url,
  138. gsize start_offset, gsize end_offset, void *ud);
  139. /**
  140. * Search for multiple urls in text and call `func` for each url found
  141. * @param pool
  142. * @param in
  143. * @param inlen
  144. * @param is_html
  145. * @param func
  146. * @param ud
  147. */
  148. void rspamd_url_find_multiple (rspamd_mempool_t *pool, const gchar *in,
  149. gsize inlen, gboolean is_html, GPtrArray *nlines,
  150. url_insert_function func, gpointer ud);
  151. /**
  152. * Search for a single url in text and call `func` for each url found
  153. * @param pool
  154. * @param in
  155. * @param inlen
  156. * @param is_html
  157. * @param func
  158. * @param ud
  159. */
  160. void rspamd_url_find_single (rspamd_mempool_t *pool, const gchar *in,
  161. gsize inlen, gboolean is_html,
  162. url_insert_function func, gpointer ud);
  163. /**
  164. * Generic callback to insert URLs into rspamd_task
  165. * @param url
  166. * @param start_offset
  167. * @param end_offset
  168. * @param ud
  169. */
  170. void rspamd_url_task_subject_callback (struct rspamd_url *url,
  171. gsize start_offset,
  172. gsize end_offset, gpointer ud);
  173. /**
  174. * Adds a tag for url
  175. * @param url
  176. * @param tag
  177. * @param pool
  178. */
  179. void rspamd_url_add_tag (struct rspamd_url *url, const gchar *tag,
  180. const gchar *value,
  181. rspamd_mempool_t *pool);
  182. guint rspamd_url_hash (gconstpointer u);
  183. guint rspamd_email_hash (gconstpointer u);
  184. guint rspamd_url_host_hash (gconstpointer u);
  185. /* Compare two emails for building emails hash */
  186. gboolean rspamd_emails_cmp (gconstpointer a, gconstpointer b);
  187. /* Compare two urls for building emails hash */
  188. gboolean rspamd_urls_cmp (gconstpointer a, gconstpointer b);
  189. gboolean rspamd_urls_host_cmp (gconstpointer a, gconstpointer b);
  190. /**
  191. * Decode URL encoded string in-place and return new length of a string, src and dst are NULL terminated
  192. * @param dst
  193. * @param src
  194. * @param size
  195. * @return
  196. */
  197. gsize rspamd_url_decode (gchar *dst, const gchar *src, gsize size);
  198. /**
  199. * Encode url if needed. In this case, memory is allocated from the specific pool.
  200. * Returns pointer to begin and encoded length in `dlen`
  201. * @param url
  202. * @param pool
  203. * @return
  204. */
  205. const gchar * rspamd_url_encode (struct rspamd_url *url, gsize *dlen,
  206. rspamd_mempool_t *pool);
  207. /**
  208. * Returns if a character is domain character
  209. * @param c
  210. * @return
  211. */
  212. gboolean rspamd_url_is_domain (int c);
  213. #endif