source.dussan.org Git - rspamd.git/commitdiff
[Rework] Urls: adopt html related stuff
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Sat, 7 Mar 2020 16:32:14 +0000 (16:32 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 9 Mar 2020 10:46:11 +0000 (10:46 +0000)
src/libmime/message.c
src/libserver/html.c
src/libserver/html.h
src/libserver/url.c
src/libserver/url.h

index 40b7fe8bcd291b1020620cbe148981b6628dd77a..c45550e6d0e8d6cca5db31963d8573abd39b2c1d 100644 (file)
@@ -758,8 +758,7 @@ rspamd_message_process_html_text_part (struct rspamd_task *task,
                        text_part->html,
                        text_part->utf_raw_content,
                        &text_part->exceptions,
-                       MESSAGE_FIELD (task, urls),
-                       MESSAGE_FIELD (task, emails));
+                       MESSAGE_FIELD (task, urls));
 
        if (text_part->utf_content->len == 0) {
                text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
index 981141ad8dddf39fe8a658921ec5935e9fc354c6..9b528257580071a89dec87c8759b323f6516636f 100644 (file)
@@ -1617,10 +1617,9 @@ rspamd_html_process_url_tag (rspamd_mempool_t *pool, struct html_tag *tag,
 
 static void
 rspamd_process_html_url (rspamd_mempool_t *pool, struct rspamd_url *url,
-               GHashTable *tbl_urls, GHashTable *tbl_emails)
+                                                khash_t (rspamd_url_hash) *url_set)
 {
-       GHashTable *target_tbl;
-       struct rspamd_url *query_url, *existing;
+       struct rspamd_url *query_url;
        gchar *url_str;
        gint rc;
        gboolean prefix_added;
@@ -1648,13 +1647,6 @@ rspamd_process_html_url (rspamd_mempool_t *pool, struct rspamd_url *url,
                                msg_debug_html ("found url %s in query of url"
                                                " %*s", url_str, url->querylen, rspamd_url_query_unsafe (url));
 
-                               if (query_url->protocol == PROTOCOL_MAILTO) {
-                                       target_tbl = tbl_emails;
-                               }
-                               else {
-                                       target_tbl = tbl_urls;
-                               }
-
                                if (prefix_added) {
                                        query_url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
                                }
@@ -1671,15 +1663,7 @@ rspamd_process_html_url (rspamd_mempool_t *pool, struct rspamd_url *url,
                                        query_url->flags |= RSPAMD_URL_FLAG_OBSCURED;
                                }
 
-                               if ((existing = g_hash_table_lookup (target_tbl,
-                                               query_url)) == NULL) {
-                                       g_hash_table_insert (target_tbl,
-                                                       query_url,
-                                                       query_url);
-                               }
-                               else {
-                                       existing->count ++;
-                               }
+                               rspamd_url_set_add_or_increase (url_set, query_url);
                        }
                }
        }
@@ -1739,7 +1723,7 @@ rspamd_html_process_data_image (rspamd_mempool_t *pool,
 
 static void
 rspamd_html_process_img_tag (rspamd_mempool_t *pool, struct html_tag *tag,
-               struct html_content *hc, GHashTable *urls)
+               struct html_content *hc, khash_t (rspamd_url_hash) *url_set)
 {
        struct html_tag_component *comp;
        struct html_image *img;
@@ -1784,17 +1768,8 @@ rspamd_html_process_img_tag (rspamd_mempool_t *pool, struct html_tag *tag,
                                                                img->src, fstr.len, NULL);
 
                                                if (img->url) {
-                                                       struct rspamd_url *turl = g_hash_table_lookup (urls,
-                                                                       img->url);
-
                                                        img->url->flags |= RSPAMD_URL_FLAG_IMAGE;
-
-                                                       if (turl == NULL) {
-                                                               g_hash_table_insert (urls, img->url, img->url);
-                                                       }
-                                                       else {
-                                                               turl->count++;
-                                                       }
+                                                       rspamd_url_set_add_or_increase (url_set, img->url);
                                                }
                                        }
                                }
@@ -2449,10 +2424,11 @@ rspamd_html_process_block_tag (rspamd_mempool_t *pool, struct html_tag *tag,
 
 static void
 rspamd_html_check_displayed_url (rspamd_mempool_t *pool,
-               GList **exceptions, GHashTable *urls, GHashTable *emails,
-               GByteArray *dest, GHashTable *target_tbl,
-               gint href_offset,
-               struct rspamd_url *url)
+                                                                GList **exceptions,
+                                                                khash_t (rspamd_url_hash) *url_set,
+                                                                GByteArray *dest,
+                                                                gint href_offset,
+                                                                struct rspamd_url *url)
 {
        struct rspamd_url *displayed_url = NULL;
        struct rspamd_url *turl;
@@ -2477,6 +2453,7 @@ rspamd_html_check_displayed_url (rspamd_mempool_t *pool,
        if (url_found) {
                url->flags |= RSPAMD_URL_FLAG_DISPLAY_URL;
        }
+
        if (exceptions && url_found) {
                ex = rspamd_mempool_alloc (pool,
                                sizeof (*ex));
@@ -2489,39 +2466,27 @@ rspamd_html_check_displayed_url (rspamd_mempool_t *pool,
                                ex);
        }
 
-       if (displayed_url) {
-               if (displayed_url->protocol ==
-                               PROTOCOL_MAILTO) {
-                       target_tbl = emails;
-               }
-               else {
-                       target_tbl = urls;
-               }
+       if (displayed_url && url_set) {
+               turl = rspamd_url_set_add_or_return (url_set,
+                               displayed_url);
 
-               if (target_tbl != NULL) {
-                       turl = g_hash_table_lookup (target_tbl,
-                                       displayed_url);
-
-                       if (turl != NULL) {
-                               /* Here, we assume the following:
-                                * if we have a URL in the text part which
-                                * is the same as displayed URL in the
-                                * HTML part, we assume that it is also
-                                * hint only.
-                                */
-                               if (turl->flags &
-                                               RSPAMD_URL_FLAG_FROM_TEXT) {
-                                       turl->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
-                                       turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT;
-                               }
-
-                               turl->count ++;
-                       }
-                       else {
-                               g_hash_table_insert (target_tbl,
-                                               displayed_url,
-                                               displayed_url);
+               if (turl != NULL) {
+                       /* Here, we assume the following:
+                        * if we have a URL in the text part which
+                        * is the same as displayed URL in the
+                        * HTML part, we assume that it is also
+                        * hint only.
+                        */
+                       if (turl->flags &
+                               RSPAMD_URL_FLAG_FROM_TEXT) {
+                               turl->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
+                               turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT;
                        }
+
+                       turl->count ++;
+               }
+               else {
+                       /* Already inserted by `rspamd_url_set_add_or_return` */
                }
        }
 }
@@ -2625,20 +2590,22 @@ rspamd_html_propagate_style (struct html_content *hc,
 }
 
 GByteArray*
-rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
-               GByteArray *in, GList **exceptions, GHashTable *urls,  GHashTable *emails)
+rspamd_html_process_part_full (rspamd_mempool_t *pool,
+                                                          struct html_content *hc,
+                                                          GByteArray *in,
+                                                          GList **exceptions,
+                                                          khash_t (rspamd_url_hash) *url_set)
 {
        const guchar *p, *c, *end, *savep = NULL;
        guchar t;
        gboolean closing = FALSE, need_decode = FALSE, save_space = FALSE,
                        balanced;
        GByteArray *dest;
-       GHashTable *target_tbl;
        guint obrace = 0, ebrace = 0;
        GNode *cur_level = NULL;
        gint substate = 0, len, href_offset = -1;
        struct html_tag *cur_tag = NULL, *content_tag = NULL;
-       struct rspamd_url *url = NULL, *turl;
+       struct rspamd_url *url = NULL;
        GQueue *styles_blocks;
 
        enum {
@@ -3089,28 +3056,9 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
 
                                                if (url != NULL) {
 
-                                                       if (url->protocol == PROTOCOL_MAILTO) {
-                                                               target_tbl = emails;
-                                                       }
-                                                       else {
-                                                               target_tbl = urls;
-                                                       }
-
-                                                       if (target_tbl != NULL) {
-                                                               turl = g_hash_table_lookup (target_tbl, url);
-
-                                                               if (turl == NULL) {
-                                                                       g_hash_table_insert (target_tbl, url, url);
-                                                               }
-                                                               else {
-                                                                       turl->count ++;
-                                                                       url = NULL;
-                                                               }
-
-                                                               if (turl == NULL && url != NULL) {
-                                                                       rspamd_process_html_url (pool,
-                                                                                       url,
-                                                                                       urls, emails);
+                                                       if (url_set != NULL) {
+                                                               if (!rspamd_url_set_add_or_increase (url_set, url)) {
+                                                                       rspamd_process_html_url (pool, url, url_set);
                                                                }
                                                        }
 
@@ -3131,8 +3079,8 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
                                                                prev_url = prev_tag->extra;
 
                                                                rspamd_html_check_displayed_url (pool,
-                                                                               exceptions, urls, emails,
-                                                                               dest, target_tbl, href_offset,
+                                                                               exceptions, url_set,
+                                                                               dest, href_offset,
                                                                                prev_url);
                                                        }
                                                }
@@ -3142,8 +3090,8 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
                                                        /* Insert exception */
                                                        if (url != NULL && (gint) dest->len > href_offset) {
                                                                rspamd_html_check_displayed_url (pool,
-                                                                               exceptions, urls, emails,
-                                                                               dest, target_tbl, href_offset,
+                                                                               exceptions, url_set,
+                                                                               dest, href_offset,
                                                                                url);
 
                                                        }
@@ -3172,7 +3120,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
                                }
 
                                if (cur_tag->id == Tag_IMG && !(cur_tag->flags & FL_CLOSING)) {
-                                       rspamd_html_process_img_tag (pool, cur_tag, hc, urls);
+                                       rspamd_html_process_img_tag (pool, cur_tag, hc, url_set);
                                }
                                else if (cur_tag->flags & FL_BLOCK) {
                                        struct html_block *bl;
@@ -3237,5 +3185,5 @@ rspamd_html_process_part (rspamd_mempool_t *pool,
                struct html_content *hc,
                GByteArray *in)
 {
-       return rspamd_html_process_part_full (pool, hc, in, NULL, NULL, NULL);
+       return rspamd_html_process_part_full (pool, hc, in, NULL, NULL);
 }
index ee5c242cb18ab703a8fbe877b3a686a26afc0994..051df5b63019671cc63536d3ab154b1421c62b61 100644 (file)
@@ -6,7 +6,8 @@
 #define RSPAMD_HTML_H
 
 #include "config.h"
-#include "mem_pool.h"
+#include "libutil/mem_pool.h"
+#include "libserver/url.h"
 
 #ifdef  __cplusplus
 extern "C" {
@@ -142,7 +143,7 @@ GByteArray *rspamd_html_process_part (rspamd_mempool_t *pool,
 GByteArray *rspamd_html_process_part_full (rspamd_mempool_t *pool,
                                                                                   struct html_content *hc,
                                                                                   GByteArray *in, GList **exceptions,
-                                                                                  GHashTable *urls, GHashTable *emails);
+                                                                                  khash_t (rspamd_url_hash) *url_set);
 
 /*
  * Returns true if a specified tag has been seen in a part
index 505d1d15057cd78a2a98fba40786910ad8e716f4..39ea5acc2c31edfdffea1933aff73d68fdaf6e55 100644 (file)
@@ -3297,7 +3297,7 @@ rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset,
 {
        struct rspamd_task *task = ud;
        gchar *url_str = NULL;
-       struct rspamd_url *query_url, *existing;
+       struct rspamd_url *query_url;
        gint rc;
        gboolean prefix_added;
 
@@ -3781,6 +3781,26 @@ rspamd_url_set_add_or_increase (khash_t (rspamd_url_hash) *set,
        return true;
 }
 
+struct rspamd_url *
+rspamd_url_set_add_or_return (khash_t (rspamd_url_hash) *set,
+                                                               struct rspamd_url *u)
+{
+       khiter_t k;
+       gint r;
+
+       if (set) {
+               k = kh_put (rspamd_url_hash, set, u, &r);
+
+               if (r == 0) {
+                       struct rspamd_url *ex = kh_key (set, k);
+
+                       return ex;
+               }
+       }
+
+       return NULL;
+}
+
 bool
 rspamd_url_host_set_add (khash_t (rspamd_url_host_hash) *set,
                                                                struct rspamd_url *u)
@@ -3788,13 +3808,17 @@ rspamd_url_host_set_add (khash_t (rspamd_url_host_hash) *set,
        khiter_t k;
        gint r;
 
-       k = kh_put (rspamd_url_host_hash, set, u, &r);
+       if (set) {
+               k = kh_put (rspamd_url_host_hash, set, u, &r);
 
-       if (r == 0) {
-               return false;
+               if (r == 0) {
+                       return false;
+               }
+
+               return true;
        }
 
-       return true;
+       return false;
 }
 
 bool
@@ -3802,13 +3826,17 @@ rspamd_url_set_has (khash_t (rspamd_url_hash) *set, struct rspamd_url *u)
 {
        khiter_t k;
 
-       k = kh_get (rspamd_url_hash, set, u);
+       if (set) {
+               k = kh_get (rspamd_url_hash, set, u);
 
-       if (k == kh_end (set)) {
-               return false;
+               if (k == kh_end (set)) {
+                       return false;
+               }
+
+               return true;
        }
 
-       return true;
+       return false;
 }
 
 bool
@@ -3816,11 +3844,15 @@ rspamd_url_host_set_has (khash_t (rspamd_url_host_hash) *set, struct rspamd_url
 {
        khiter_t k;
 
-       k = kh_get (rspamd_url_hash, set, u);
+       if (set) {
+               k = kh_get (rspamd_url_host_hash, set, u);
 
-       if (k == kh_end (set)) {
-               return false;
+               if (k == kh_end (set)) {
+                       return false;
+               }
+
+               return true;
        }
 
-       return true;
+       return false;
 }
\ No newline at end of file
index aff7ccf5fc26dde00f78a812ae5660bb2adae71e..bf8ba4b6376171aae3070a69f8edb21ea79687f2 100644 (file)
@@ -280,6 +280,15 @@ KHASH_DECLARE (rspamd_url_host_hash, struct rspamd_url *, char);
  */
 bool rspamd_url_set_add_or_increase (khash_t (rspamd_url_hash) *set,
                struct rspamd_url *u);
+
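+/**
+ * Like rspamd_url_set_add_or_increase, but returns the existing url (leaving its count untouched) instead of a flag
+ * @param set url set to update; may be NULL, in which case nothing is added
+ * @param u url to add to the set
+ * @return the url already stored in the set if one equal to `u` was found, NULL otherwise
+ */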
+struct rspamd_url * rspamd_url_set_add_or_return (khash_t (rspamd_url_hash) *set,
+                                                                                                 struct rspamd_url *u);
 /**
  * Helper for url host set
  * @param set