]> source.dussan.org Git - rspamd.git/commitdiff
[Rework] Html: Deal with the utf_content part
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 2 Jun 2021 19:50:48 +0000 (20:50 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 2 Jun 2021 19:55:10 +0000 (20:55 +0100)
src/libmime/message.c
src/libmime/message.h
src/libmime/mime_expressions.c
src/libserver/html/html.cxx
src/libserver/html/html.h
src/libserver/re_cache.c
src/lua/lua_html.cxx
src/lua/lua_mimepart.c
src/lua/lua_parsers.c
src/lua/lua_trie.c

index 21ab36e27dd10df08306cb3cfa64a306b9f6176a..4bdeb661275163a8d88980db5e18a37bd09bf594 100644 (file)
@@ -522,10 +522,10 @@ rspamd_normalize_text_part (struct rspamd_task *task,
                part->utf_stripped_content = g_byte_array_new ();
        }
        else {
-               part->utf_stripped_content = g_byte_array_sized_new (part->utf_content->len);
+               part->utf_stripped_content = g_byte_array_sized_new (part->utf_content.len);
 
-               p = (const gchar *)part->utf_content->data;
-               end = p + part->utf_content->len;
+               p = (const gchar *)part->utf_content.begin;
+               end = p + part->utf_content.len;
 
                rspamd_strip_newlines_parse (task, p, end, part);
 
@@ -668,10 +668,10 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part
                g_assert (rspamd_multipattern_compile (gtube_matcher, NULL));
        }
 
-       if (part->utf_content && part->utf_content->len >= sizeof (gtube_pattern_reject) &&
-                       part->utf_content->len <= max_check_size) {
-               if ((ret = rspamd_multipattern_lookup (gtube_matcher, part->utf_content->data,
-                               part->utf_content->len,
+       if (part->utf_content.len >= sizeof (gtube_pattern_reject) &&
+                       part->utf_content.len <= max_check_size) {
+               if ((ret = rspamd_multipattern_lookup (gtube_matcher, part->utf_content.begin,
+                               part->utf_content.len,
                                rspamd_multipattern_gtube_cb, task, NULL)) > 0) {
 
                        switch (ret) {
@@ -698,7 +698,7 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part
                                msg_info_task (
                                                "gtube %s pattern has been found in part of length %ud",
                                                rspamd_action_to_str (act),
-                                               part->utf_content->len);
+                                               part->utf_content.len);
                        }
                }
        }
@@ -728,13 +728,16 @@ rspamd_message_process_plain_text_part (struct rspamd_task *task,
 
        if (text_part->utf_raw_content != NULL) {
                /* Just have the same content */
-               text_part->utf_content = text_part->utf_raw_content;
+               text_part->utf_content.begin = (const gchar *)text_part->utf_raw_content->data;
+               text_part->utf_content.len = text_part->utf_raw_content->len;
        }
        else {
                /*
                 * We ignore unconverted parts from now as it is dangerous
                 * to treat them as text parts
                 */
+               text_part->utf_content.begin = NULL;
+               text_part->utf_content.len = 0;
 
                return FALSE;
        }
@@ -760,26 +763,21 @@ rspamd_message_process_html_text_part (struct rspamd_task *task,
                return FALSE;
        }
 
-       text_part->html = rspamd_mempool_alloc0 (task->task_pool,
-                       sizeof (*text_part->html));
+
        text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_BALANCED;
-       text_part->utf_content = rspamd_html_process_part_full (
+       text_part->html = rspamd_html_process_part_full (
                        task->task_pool,
-                       text_part->html,
                        text_part->utf_raw_content,
                        &text_part->exceptions,
                        MESSAGE_FIELD (task, urls),
                        text_part->mime_part->urls,
                        task->cfg ? task->cfg->enable_css_parser : false);
+       rspamd_html_get_parsed_content(text_part->html, &text_part->utf_content);
 
-       if (text_part->utf_content->len == 0) {
+       if (text_part->utf_content.len == 0) {
                text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
        }
 
-       rspamd_mempool_add_destructor (task->task_pool,
-                       (rspamd_mempool_destruct_t) free_byte_array_callback,
-                       text_part->utf_content);
-
        return TRUE;
 }
 
@@ -1546,7 +1544,7 @@ rspamd_message_process (struct rspamd_task *task)
                                                sel = p2;
                                        }
                                        else {
-                                               if (p1->utf_content->len > p2->utf_content->len) {
+                                               if (p1->utf_content.len > p2->utf_content.len) {
                                                        sel = p1;
                                                }
                                                else {
@@ -1659,4 +1657,4 @@ void rspamd_message_update_digest (struct rspamd_message *msg,
        memcpy (n, msg->digest, sizeof (msg->digest));
        n[0] = t1ha2_atonce128 (&n[1], input, len, n[0]);
        memcpy (msg->digest, n, sizeof (msg->digest));
-}
\ No newline at end of file
+}
index 13e40e2eff28150ef43ab34e411581b13a9dd4de..8805fbf30eb3fe1eaab5c1a68d74b982114fa752 100644 (file)
@@ -138,7 +138,7 @@ struct rspamd_mime_text_part {
        rspamd_ftok_t parsed; /* decoded from mime encodings */
 
        /* UTF8 content */
-       GByteArray *utf_content; /* utf8 encoded processed content */
+       rspamd_ftok_t utf_content; /* utf8 encoded processed content */
        GByteArray *utf_raw_content; /* utf raw content */
        GByteArray *utf_stripped_content; /* utf content with no newlines */
        GArray *normalized_hashes; /* Array of guint64 */
index 99c5d3a195d102f17e1b652d77404a6fd532eb50..a528be50cfd57524b78d6f8cc2d86f82eac79412 100644 (file)
@@ -1625,7 +1625,7 @@ rspamd_has_fake_html (struct rspamd_task * task, GArray * args, void *unused)
        gboolean res = FALSE;
 
        PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, p) {
-               if (IS_TEXT_PART_HTML (p) && (p->html == NULL || p->html->html_tags == NULL)) {
+               if (IS_TEXT_PART_HTML (p) && (p->html == NULL)) {
                        res = TRUE;
                }
 
index 00f1d331f38f4ca6e25e35eb3e42d270b6ba7bfb..47e4e81a077f95395a372fdd58ab9acb385fa6e1 100644 (file)
@@ -2387,4 +2387,15 @@ rspamd_html_find_embedded_image(void *html_content,
        }
 
        return nullptr;
+}
+
+bool /* C-callable accessor: exposes the parsed text of an html content object as an ftok */
+rspamd_html_get_parsed_content(void *html_content, rspamd_ftok_t *dest)
+{
+       auto *hc = rspamd::html::html_content::from_ptr(html_content); /* recover the C++ object from the void * handle */
+
+       dest->begin = hc->parsed.data(); /* no copy: dest points at the buffer held by hc->parsed */
+       dest->len = hc->parsed.size();
+
+       return true; /* no failure path; neither html_content nor dest is checked for NULL */
+}
\ No newline at end of file
index 1e71d0c2d0706489352ef46f0f6767ee2222fc7b..3b6592402e3e115a0a9ecebe41becea51e48b272 100644 (file)
@@ -154,6 +154,14 @@ const gchar *rspamd_html_tag_name(void *tag, gsize *len);
 struct html_image* rspamd_html_find_embedded_image(void *html_content,
                const char *cid, gsize cid_len);
 
+/**
+ * Stores parsed content in ftok_t structure (pointer and length only, the text is not copied)
+ * @param html_content opaque html content pointer, e.g. the result of rspamd_html_process_part_full
+ * @param dest token to fill; it refers to storage held by html_content (presumably valid only while that object lives)
+ * @return true (the current implementation has no failure path)
+ */
+bool rspamd_html_get_parsed_content(void *html_content, rspamd_ftok_t *dest);
+
 
 #ifdef  __cplusplus
 }
index 1b591a81cb7c6508677b0ef5bc2b89b15b8d08ec..631981b30d62cbb20992f7ae7ef202faeca339e1 100644 (file)
@@ -1224,8 +1224,8 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
                                                        raw = TRUE;
                                                }
 
-                                               in = text_part->utf_content->data;
-                                               len = text_part->utf_content->len;
+                                               in = text_part->utf_content.begin;
+                                               len = text_part->utf_content.len;
                                        }
                                }
 
index 30bfa55d61ff08327beec2e3bdc58fec81b1014e..4dd59083c3eb28bbea0ab246b83935dc0f62d248 100644 (file)
@@ -16,6 +16,7 @@
 #include "lua_common.h"
 #include "message.h"
 #include "libserver/html/html.h"
+#include "libserver/html/html.hxx"
 #include "libserver/html/html_tag.hxx"
 #include "images.h"
 
@@ -180,12 +181,12 @@ static const struct luaL_reg taglib_m[] = {
        {NULL, NULL}
 };
 
-static struct html_content *
+static struct rspamd::html::html_content *
 lua_check_html (lua_State * L, gint pos)
 {
        void *ud = rspamd_lua_check_udata (L, pos, "rspamd{html}");
        luaL_argcheck (L, ud != NULL, pos, "'html' expected");
-       return ud ? *((struct html_content **)ud) : NULL;
+       return ud ? *((struct rspamd::html::html_content **)ud) : NULL;
 }
 
 struct lua_html_tag {
@@ -205,7 +206,7 @@ static gint
 lua_html_has_tag (lua_State *L)
 {
        LUA_TRACE_POINT;
-       struct html_content *hc = lua_check_html (L, 1);
+       auto *hc = lua_check_html (L, 1);
        const gchar *tagname = luaL_checkstring (L, 2);
        gboolean ret = FALSE;
 
@@ -238,7 +239,7 @@ static gint
 lua_html_has_property (lua_State *L)
 {
        LUA_TRACE_POINT;
-       struct html_content *hc = lua_check_html (L, 1);
+       auto *hc = lua_check_html (L, 1);
        const gchar *propname = luaL_checkstring (L, 2);
        gboolean ret = FALSE;
 
@@ -256,7 +257,7 @@ lua_html_has_property (lua_State *L)
 }
 
 static void
-lua_html_push_image (lua_State *L, struct html_image *img)
+lua_html_push_image (lua_State *L, const struct html_image *img)
 {
        LUA_TRACE_POINT;
        struct lua_html_tag *ltag;
@@ -319,22 +320,15 @@ static gint
 lua_html_get_images (lua_State *L)
 {
        LUA_TRACE_POINT;
-       struct html_content *hc = lua_check_html (L, 1);
-       struct html_image *img;
-
-       guint i;
+       auto *hc = lua_check_html (L, 1);
+       guint i = 1;
 
        if (hc != NULL) {
-               if (hc->images) {
-                       lua_createtable (L, hc->images->len, 0);
+               lua_createtable (L, hc->images.size(), 0);
 
-                       PTR_ARRAY_FOREACH (hc->images, i, img) {
-                               lua_html_push_image (L, img);
-                               lua_rawseti (L, -2, i + 1);
-                       }
-               }
-               else {
-                       lua_newtable (L);
+               for (const auto *img : hc->images) {
+                       lua_html_push_image (L, img);
+                       lua_rawseti (L, -2, i++);
                }
        }
        else {
@@ -410,14 +404,14 @@ static gint
 lua_html_get_blocks (lua_State *L)
 {
        LUA_TRACE_POINT;
-       struct html_content *hc = lua_check_html (L, 1);
+       auto *hc = lua_check_html (L, 1);
        struct html_block *bl;
 
        guint i;
 
        if (hc != NULL) {
-               if (hc->blocks && hc->blocks->len > 0) {
-                       lua_createtable (L, hc->blocks->len, 0);
+               if (hc->blocks.size() > 0) {
+                       lua_createtable (L, hc->blocks.size(), 0);
 
                        for (i = 0; i < hc->blocks->len; i ++) {
                                bl = static_cast<decltype(bl)>(g_ptr_array_index (hc->blocks, i));
index fe8bb424613f5405de5aab5ae49790175b8be4e5..b6e5b157d3421f498966fe291338d9b9ed23902c 100644 (file)
@@ -694,8 +694,8 @@ lua_textpart_get_content (lua_State * L)
                        lua_pushnil (L);
                        return 1;
                }
-               start = part->utf_content->data;
-               len = part->utf_content->len;
+               start = part->utf_content.begin;
+               len = part->utf_content.len;
        }
        else if (strcmp (type, "content") == 0) {
                if (IS_TEXT_PART_EMPTY (part)) {
@@ -703,8 +703,8 @@ lua_textpart_get_content (lua_State * L)
                        return 1;
                }
 
-               start = part->utf_content->data;
-               len = part->utf_content->len;
+               start = part->utf_content.begin;
+               len = part->utf_content.len;
        }
        else if (strcmp (type, "content_oneline") == 0) {
                if (IS_TEXT_PART_EMPTY (part)) {
@@ -809,11 +809,11 @@ lua_textpart_get_length (lua_State * L)
                return 1;
        }
 
-       if (IS_TEXT_PART_EMPTY (part) || part->utf_content == NULL) {
+       if (IS_TEXT_PART_EMPTY (part) || part->utf_content.len == 0) {
                lua_pushinteger (L, 0);
        }
        else {
-               lua_pushinteger (L, part->utf_content->len);
+               lua_pushinteger (L, part->utf_content.len);
        }
 
        return 1;
index a0c2f264d70cd27d17d31e489332a73812df9274..6c75d8039092986b2ab927a4517e3b9c2c9fcc43 100644 (file)
@@ -206,9 +206,9 @@ lua_parsers_parse_html (lua_State *L)
        struct rspamd_lua_text *t;
        const gchar *start = NULL;
        gsize len;
-       GByteArray *res, *in;
+       GByteArray *in;
        rspamd_mempool_t *pool;
-       struct html_content *hc;
+       void *hc;
 
        if (lua_type (L, 1) == LUA_TUSERDATA) {
                t = lua_check_text (L, 1);
@@ -224,19 +224,15 @@ lua_parsers_parse_html (lua_State *L)
 
        if (start != NULL) {
                pool = rspamd_mempool_new (rspamd_mempool_suggest_size (), NULL, 0);
-               hc = rspamd_mempool_alloc0 (pool, sizeof (*hc));
                in = g_byte_array_sized_new (len);
                g_byte_array_append (in, start, len);
 
-               res = rspamd_html_process_part (pool, hc, in);
+               hc = rspamd_html_process_part(pool, in);
 
-               t = lua_newuserdata (L, sizeof (*t));
-               rspamd_lua_setclass (L, "rspamd{text}", -1);
-               t->start = res->data;
-               t->len = res->len;
-               t->flags = RSPAMD_TEXT_FLAG_OWN;
+               rspamd_ftok_t res;
+               rspamd_html_get_parsed_content(hc, &res);
+               lua_new_text(L, res.begin, res.len, TRUE);
 
-               g_byte_array_free (res, FALSE);
                g_byte_array_free (in, TRUE);
                rspamd_mempool_delete (pool);
        }
index 33e5832a88e2c119681ebd125f9b89620ea9ed76..3b1e946ec777488474ae8ebd7f8d549c20f87a27 100644 (file)
@@ -375,9 +375,9 @@ lua_trie_search_mime (lua_State *L)
 
        if (trie && task) {
                PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, part) {
-                       if (!IS_TEXT_PART_EMPTY (part) && part->utf_content != NULL) {
-                               text = part->utf_content->data;
-                               len = part->utf_content->len;
+                       if (!IS_TEXT_PART_EMPTY (part) && part->utf_content.len > 0) {
+                               text = part->utf_content.begin;
+                               len = part->utf_content.len;
 
                                if (lua_trie_search_str (L, trie, text, len, cb) != 0) {
                                        found = TRUE;