source.dussan.org Git - rspamd.git/commitdiff
[Rework] Rework HTML content urls extraction
author: Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 2 Apr 2019 10:07:53 +0000 (11:07 +0100)
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 2 Apr 2019 10:07:53 +0000 (11:07 +0100)
src/libmime/message.c
src/libserver/html.c
src/libserver/url.c
src/libserver/url.h
src/lua/lua_url.c

index cca134f81fe4126c5424d751141464c7cbdfb2e0..6825bc2f026a76ae68baf2697201f9151e71995c 100644 (file)
@@ -912,7 +912,12 @@ rspamd_message_process_text_part_maybe (struct rspamd_task *task,
        rspamd_normalize_text_part (task, text_part);
 
        if (!IS_PART_HTML (text_part)) {
-               rspamd_url_text_extract (task->task_pool, task, text_part, FALSE);
+               rspamd_url_text_extract (task->task_pool, task, text_part,
+                               RSPAMD_URL_FIND_ALL);
+       }
+       else {
+               rspamd_url_text_extract (task->task_pool, task, text_part,
+                               RSPAMD_URL_FIND_STRICT);
        }
 
        if (text_part->exceptions) {
@@ -1231,7 +1236,8 @@ rspamd_message_parse (struct rspamd_task *task)
                p = task->subject;
                len = strlen (p);
                rspamd_cryptobox_hash_update (&st, p, len);
-               rspamd_url_find_multiple (task->task_pool, p, len, FALSE, NULL,
+               rspamd_url_find_multiple (task->task_pool, p, len,
+                               RSPAMD_URL_FIND_STRICT, NULL,
                                rspamd_url_task_subject_callback, task);
        }
 
index 6df545f002462376a6ea0e2a05523dc377553dc8..41925609e9d8b3399584f28a525c33c5dc7a26ad 100644 (file)
@@ -598,7 +598,8 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool,
        }
 
        if (end > url_text + 4 &&
-                       rspamd_url_find (pool, url_text, end - url_text, &url_str, FALSE,
+                       rspamd_url_find (pool, url_text, end - url_text, &url_str,
+                                       RSPAMD_URL_FIND_ALL,
                                        &url_pos, NULL) &&
                        url_str != NULL) {
                if (url_pos > 0) {
@@ -1569,7 +1570,8 @@ rspamd_process_html_url (rspamd_mempool_t *pool, struct rspamd_url *url,
 
        if (url->querylen > 0) {
 
-               if (rspamd_url_find (pool, url->query, url->querylen, &url_str, FALSE,
+               if (rspamd_url_find (pool, url->query, url->querylen, &url_str,
+                               RSPAMD_URL_FIND_ALL,
                                NULL, &prefix_added)) {
                        query_url = rspamd_mempool_alloc0 (pool,
                                        sizeof (struct rspamd_url));
index f0f5bb21b1cdaba6ed4171602cc00f0f93e4c640..d774eb44010326e0922bedd4cef27cbf2451cdb2 100644 (file)
@@ -205,7 +205,7 @@ struct url_matcher static_matchers[] = {
                {"sip:",      "",          url_web_start,   url_web_end,
                                0, 0},
                {"www.",      "http://",   url_web_start,   url_web_end,
-                               0, 0},
+                               URL_FLAG_NOHTML, 0},
                {"ftp.",      "ftp://",    url_web_start,   url_web_end,
                                URL_FLAG_NOHTML, 0},
                /* Likely emails */
@@ -218,7 +218,7 @@ struct url_callback_data {
        gchar *url_str;
        rspamd_mempool_t *pool;
        gint len;
-       gboolean is_html;
+       enum rspamd_url_find_type how;
        gboolean prefix_added;
        guint newline_idx;
        GPtrArray *newlines;
@@ -2584,12 +2584,12 @@ rspamd_url_trie_is_match (struct url_matcher *matcher, const gchar *pos,
 
 static gint
 rspamd_url_trie_callback (struct rspamd_multipattern *mp,
-               guint strnum,
-               gint match_start,
-               gint match_pos,
-               const gchar *text,
-               gsize len,
-               void *context)
+                                                 guint strnum,
+                                                 gint match_start,
+                                                 gint match_pos,
+                                                 const gchar *text,
+                                                 gsize len,
+                                                 void *context)
 {
        struct url_matcher *matcher;
        url_match_t m;
@@ -2599,7 +2599,7 @@ rspamd_url_trie_callback (struct rspamd_multipattern *mp,
        matcher = &g_array_index (url_scanner->matchers, struct url_matcher,
                        strnum);
 
-       if ((matcher->flags & URL_FLAG_NOHTML) && cb->is_html) {
+       if ((matcher->flags & URL_FLAG_NOHTML) && cb->how == RSPAMD_URL_FIND_STRICT) {
                /* Do not try to match non-html like urls in html texts */
                return 0;
        }
@@ -2669,9 +2669,12 @@ rspamd_url_trie_callback (struct rspamd_multipattern *mp,
 }
 
 gboolean
-rspamd_url_find (rspamd_mempool_t *pool, const gchar *begin, gsize len,
-               gchar **url_str, gboolean is_html, goffset *url_pos,
-               gboolean *prefix_added)
+rspamd_url_find (rspamd_mempool_t *pool,
+                                const gchar *begin, gsize len,
+                                gchar **url_str,
+                                enum rspamd_url_find_type how,
+                                goffset *url_pos,
+                                gboolean *prefix_added)
 {
        struct url_callback_data cb;
        gint ret;
@@ -2679,7 +2682,7 @@ rspamd_url_find (rspamd_mempool_t *pool, const gchar *begin, gsize len,
        memset (&cb, 0, sizeof (cb));
        cb.begin = begin;
        cb.end = begin + len;
-       cb.is_html = is_html;
+       cb.how = how;
        cb.pool = pool;
 
        ret = rspamd_multipattern_lookup (url_scanner->search_trie, begin, len,
@@ -2706,13 +2709,13 @@ rspamd_url_find (rspamd_mempool_t *pool, const gchar *begin, gsize len,
 
 static gint
 rspamd_url_trie_generic_callback_common (struct rspamd_multipattern *mp,
-               guint strnum,
-               gint match_start,
-               gint match_pos,
-               const gchar *text,
-               gsize len,
-               void *context,
-               gboolean multiple)
+                                                                                guint strnum,
+                                                                                gint match_start,
+                                                                                gint match_pos,
+                                                                                const gchar *text,
+                                                                                gsize len,
+                                                                                void *context,
+                                                                                gboolean multiple)
 {
        struct rspamd_url *url;
        struct url_matcher *matcher;
@@ -2726,7 +2729,7 @@ rspamd_url_trie_generic_callback_common (struct rspamd_multipattern *mp,
                        strnum);
        pool = cb->pool;
 
-       if ((matcher->flags & URL_FLAG_NOHTML) && cb->is_html) {
+       if ((matcher->flags & URL_FLAG_NOHTML) && cb->how == RSPAMD_URL_FIND_STRICT) {
                /* Do not try to match non-html like urls in html texts */
                return 0;
        }
@@ -2894,7 +2897,7 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
        /* We also search the query for additional url inside */
        if (url->querylen > 0) {
                if (rspamd_url_find (task->task_pool, url->query, url->querylen,
-                               &url_str, IS_PART_HTML (cbd->part), NULL, &prefix_added)) {
+                               &url_str, RSPAMD_URL_FIND_ALL, NULL, &prefix_added)) {
 
                        query_url = rspamd_mempool_alloc0 (task->task_pool,
                                        sizeof (struct rspamd_url));
@@ -2938,9 +2941,9 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
 
 void
 rspamd_url_text_extract (rspamd_mempool_t *pool,
-               struct rspamd_task *task,
-               struct rspamd_mime_text_part *part,
-               gboolean is_html)
+                                                struct rspamd_task *task,
+                                                struct rspamd_mime_text_part *part,
+                                                enum rspamd_url_find_type how)
 {
        struct rspamd_url_mimepart_cbdata mcbd;
 
@@ -2953,14 +2956,18 @@ rspamd_url_text_extract (rspamd_mempool_t *pool,
        mcbd.part = part;
 
        rspamd_url_find_multiple (task->task_pool, part->utf_stripped_content->data,
-                       part->utf_stripped_content->len, is_html, part->newlines,
+                       part->utf_stripped_content->len, how, part->newlines,
                        rspamd_url_text_part_callback, &mcbd);
 }
 
 void
-rspamd_url_find_multiple (rspamd_mempool_t *pool, const gchar *in,
-               gsize inlen, gboolean is_html, GPtrArray *nlines,
-               url_insert_function func, gpointer ud)
+rspamd_url_find_multiple (rspamd_mempool_t *pool,
+                                                 const gchar *in,
+                                                 gsize inlen,
+                                                 enum rspamd_url_find_type how,
+                                                 GPtrArray *nlines,
+                                                 url_insert_function func,
+                                                 gpointer ud)
 {
        struct url_callback_data cb;
 
@@ -2973,7 +2980,7 @@ rspamd_url_find_multiple (rspamd_mempool_t *pool, const gchar *in,
        memset (&cb, 0, sizeof (cb));
        cb.begin = in;
        cb.end = in + inlen;
-       cb.is_html = is_html;
+       cb.how = how;
        cb.pool = pool;
 
        cb.funcd = ud;
@@ -2986,9 +2993,12 @@ rspamd_url_find_multiple (rspamd_mempool_t *pool, const gchar *in,
 }
 
 void
-rspamd_url_find_single (rspamd_mempool_t *pool, const gchar *in,
-               gsize inlen, gboolean is_html,
-               url_insert_function func, gpointer ud)
+rspamd_url_find_single (rspamd_mempool_t *pool,
+                                               const gchar *in,
+                                               gsize inlen,
+                                               enum rspamd_url_find_type how,
+                                               url_insert_function func,
+                                               gpointer ud)
 {
        struct url_callback_data cb;
 
@@ -3001,7 +3011,7 @@ rspamd_url_find_single (rspamd_mempool_t *pool, const gchar *in,
        memset (&cb, 0, sizeof (cb));
        cb.begin = in;
        cb.end = in + inlen;
-       cb.is_html = is_html;
+       cb.how = how;
        cb.pool = pool;
 
        cb.funcd = ud;
@@ -3049,7 +3059,7 @@ rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset,
        /* We also search the query for additional url inside */
        if (url->querylen > 0) {
                if (rspamd_url_find (task->task_pool, url->query, url->querylen,
-                               &url_str, FALSE, NULL, &prefix_added)) {
+                               &url_str, RSPAMD_URL_FIND_ALL, NULL, &prefix_added)) {
 
                        query_url = rspamd_mempool_alloc0 (task->task_pool,
                                        sizeof (struct rspamd_url));
index 2cf80df4b922ecad247b5cd215a3f244629a9e96..2243534dc3cac7e4d674db3f89889b1b36f825ce 100644 (file)
@@ -90,6 +90,17 @@ enum rspamd_url_protocol {
        PROTOCOL_UNKNOWN = 1u << 31,
 };
 
+enum rspamd_url_parse_flags {
+       RSPAMD_URL_PARSE_TEXT = 0,
+       RSPAMD_URL_PARSE_HREF = (1u << 0),
+       RSPAMD_URL_PARSE_CHECK = (1 << 1),
+};
+
+enum rspamd_url_find_type {
+       RSPAMD_URL_FIND_ALL = 0,
+       RSPAMD_URL_FIND_STRICT,
+};
+
 /**
  * Initialize url library
  * @param cfg
@@ -104,15 +115,9 @@ void rspamd_url_deinit (void);
  * @param is_html turn on html euristic
  */
 void rspamd_url_text_extract (rspamd_mempool_t *pool,
-       struct rspamd_task *task,
-       struct rspamd_mime_text_part *part,
-       gboolean is_html);
-
-enum rspamd_url_parse_flags {
-       RSPAMD_URL_PARSE_TEXT = 0,
-       RSPAMD_URL_PARSE_HREF = (1u << 0),
-       RSPAMD_URL_PARSE_CHECK = (1 << 1),
-};
+                                                         struct rspamd_task *task,
+                                                         struct rspamd_mime_text_part *part,
+                                                         enum rspamd_url_find_type how);
 
 /*
  * Parse a single url into an uri structure
@@ -136,9 +141,12 @@ enum uri_errno rspamd_url_parse (struct rspamd_url *uri,
  * @param url_str storage for url string(or NULL)
  * @return TRUE if url is found in specified text
  */
-gboolean rspamd_url_find (rspamd_mempool_t *pool, const gchar *begin, gsize len,
-               gchar **url_str, gboolean is_html, goffset *url_pos,
-               gboolean *prefix_added);
+gboolean rspamd_url_find (rspamd_mempool_t *pool,
+                                                 const gchar *begin, gsize len,
+                                                 gchar **url_str,
+                                                 enum rspamd_url_find_type how,
+                                                 goffset *url_pos,
+                                                 gboolean *prefix_added);
 /*
  * Return text representation of url parsing error
  */
@@ -166,9 +174,12 @@ typedef void (*url_insert_function) (struct rspamd_url *url,
  * @param func
  * @param ud
  */
-void rspamd_url_find_multiple (rspamd_mempool_t *pool, const gchar *in,
-               gsize inlen, gboolean is_html, GPtrArray *nlines,
-               url_insert_function func, gpointer ud);
+void rspamd_url_find_multiple (rspamd_mempool_t *pool,
+                                                          const gchar *in, gsize inlen,
+                                                          enum rspamd_url_find_type how,
+                                                          GPtrArray *nlines,
+                                                          url_insert_function func,
+                                                          gpointer ud);
 /**
  * Search for a single url in text and call `func` for each url found
  * @param pool
@@ -178,9 +189,11 @@ void rspamd_url_find_multiple (rspamd_mempool_t *pool, const gchar *in,
  * @param func
  * @param ud
  */
-void rspamd_url_find_single (rspamd_mempool_t *pool, const gchar *in,
-               gsize inlen, gboolean is_html,
-               url_insert_function func, gpointer ud);
+void rspamd_url_find_single (rspamd_mempool_t *pool,
+                                                        const gchar *in, gsize inlen,
+                                                        enum rspamd_url_find_type how,
+                                                        url_insert_function func,
+                                                        gpointer ud);
 
 /**
  * Generic callback to insert URLs into rspamd_task
@@ -190,8 +203,8 @@ void rspamd_url_find_single (rspamd_mempool_t *pool, const gchar *in,
  * @param ud
  */
 void rspamd_url_task_subject_callback (struct rspamd_url *url,
-               gsize start_offset,
-               gsize end_offset, gpointer ud);
+                                                                          gsize start_offset,
+                                                                          gsize end_offset, gpointer ud);
 
 /**
  * Adds a tag for url
@@ -200,8 +213,8 @@ void rspamd_url_task_subject_callback (struct rspamd_url *url,
  * @param pool
  */
 void rspamd_url_add_tag (struct rspamd_url *url, const gchar *tag,
-               const gchar *value,
-               rspamd_mempool_t *pool);
+                                                const gchar *value,
+                                                rspamd_mempool_t *pool);
 
 guint rspamd_url_hash (gconstpointer u);
 guint rspamd_email_hash (gconstpointer u);
@@ -232,7 +245,7 @@ gsize rspamd_url_decode (gchar *dst, const gchar *src, gsize size);
  * @return
  */
 const gchar * rspamd_url_encode (struct rspamd_url *url, gsize *dlen,
-               rspamd_mempool_t *pool);
+                                                                rspamd_mempool_t *pool);
 
 
 /**
index 9bc984da309f7e98d65459522af03548e6d8b178..a0f8c464815dd8cfef1bab2d510f10f85949bb41 100644 (file)
@@ -799,7 +799,7 @@ lua_url_create (lua_State *L)
                return luaL_error (L, "invalid arguments");
        }
        else {
-               rspamd_url_find_single (pool, text, length, FALSE,
+               rspamd_url_find_single (pool, text, length, RSPAMD_URL_FIND_ALL,
                                lua_url_single_inserter, L);
 
                if (lua_type (L, -1) != LUA_TUSERDATA) {
@@ -867,7 +867,8 @@ lua_url_all (lua_State *L)
 
                if (text != NULL) {
                        lua_newtable (L);
-                       rspamd_url_find_multiple (pool, text, length, FALSE, NULL,
+                       rspamd_url_find_multiple (pool, text, length,
+                                       RSPAMD_URL_FIND_ALL, NULL,
                                        lua_url_table_inserter, L);
 
                }