From: Vsevolod Stakhov Date: Wed, 2 Jun 2021 19:50:48 +0000 (+0100) Subject: [Rework] Html: Deal with the utf_content part X-Git-Tag: 3.0~348 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=1d3c9379b9044a59e3db06697f9967ba88137a1d;p=rspamd.git [Rework] Html: Deal with the utf_content part --- diff --git a/src/libmime/message.c b/src/libmime/message.c index 21ab36e27..4bdeb6612 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -522,10 +522,10 @@ rspamd_normalize_text_part (struct rspamd_task *task, part->utf_stripped_content = g_byte_array_new (); } else { - part->utf_stripped_content = g_byte_array_sized_new (part->utf_content->len); + part->utf_stripped_content = g_byte_array_sized_new (part->utf_content.len); - p = (const gchar *)part->utf_content->data; - end = p + part->utf_content->len; + p = (const gchar *)part->utf_content.begin; + end = p + part->utf_content.len; rspamd_strip_newlines_parse (task, p, end, part); @@ -668,10 +668,10 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part g_assert (rspamd_multipattern_compile (gtube_matcher, NULL)); } - if (part->utf_content && part->utf_content->len >= sizeof (gtube_pattern_reject) && - part->utf_content->len <= max_check_size) { - if ((ret = rspamd_multipattern_lookup (gtube_matcher, part->utf_content->data, - part->utf_content->len, + if (part->utf_content.len >= sizeof (gtube_pattern_reject) && + part->utf_content.len <= max_check_size) { + if ((ret = rspamd_multipattern_lookup (gtube_matcher, part->utf_content.begin, + part->utf_content.len, rspamd_multipattern_gtube_cb, task, NULL)) > 0) { switch (ret) { @@ -698,7 +698,7 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part msg_info_task ( "gtube %s pattern has been found in part of length %ud", rspamd_action_to_str (act), - part->utf_content->len); + part->utf_content.len); } } } @@ -728,13 +728,16 @@ rspamd_message_process_plain_text_part (struct rspamd_task *task, if (text_part->utf_raw_content != NULL) { /* Just have the same content */ - text_part->utf_content = text_part->utf_raw_content; + text_part->utf_content.begin = (const gchar *)text_part->utf_raw_content->data; + text_part->utf_content.len = text_part->utf_raw_content->len; } else { /* * We ignore unconverted parts from now as it is dangerous * to treat them as text parts */ + text_part->utf_content.begin = NULL; + text_part->utf_content.len = 0; return FALSE; } @@ -760,26 +763,21 @@ rspamd_message_process_html_text_part (struct rspamd_task *task, return FALSE; } - text_part->html = rspamd_mempool_alloc0 (task->task_pool, - sizeof (*text_part->html)); + text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_BALANCED; - text_part->utf_content = rspamd_html_process_part_full ( + text_part->html = rspamd_html_process_part_full ( task->task_pool, - text_part->html, text_part->utf_raw_content, &text_part->exceptions, MESSAGE_FIELD (task, urls), text_part->mime_part->urls, task->cfg ? task->cfg->enable_css_parser : false); + rspamd_html_get_parsed_content(text_part->html, &text_part->utf_content); - if (text_part->utf_content->len == 0) { + if (text_part->utf_content.len == 0) { text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY; } - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t) free_byte_array_callback, - text_part->utf_content); - return TRUE; } @@ -1546,7 +1544,7 @@ rspamd_message_process (struct rspamd_task *task) sel = p2; } else { - if (p1->utf_content->len > p2->utf_content->len) { + if (p1->utf_content.len > p2->utf_content.len) { sel = p1; } else { @@ -1659,4 +1657,4 @@ void rspamd_message_update_digest (struct rspamd_message *msg, memcpy (n, msg->digest, sizeof (msg->digest)); n[0] = t1ha2_atonce128 (&n[1], input, len, n[0]); memcpy (msg->digest, n, sizeof (msg->digest)); -} \ No newline at end of file +} diff --git a/src/libmime/message.h b/src/libmime/message.h index 13e40e2ef..8805fbf30 100644 --- a/src/libmime/message.h +++ b/src/libmime/message.h @@ -138,7 +138,7 @@ struct rspamd_mime_text_part { rspamd_ftok_t parsed; /* decoded from mime encodings */ /* UTF8 content */ - GByteArray *utf_content; /* utf8 encoded processed content */ + rspamd_ftok_t utf_content; /* utf8 encoded processed content */ GByteArray *utf_raw_content; /* utf raw content */ GByteArray *utf_stripped_content; /* utf content with no newlines */ GArray *normalized_hashes; /* Array of guint64 */ diff --git a/src/libmime/mime_expressions.c b/src/libmime/mime_expressions.c index 99c5d3a19..a528be50c 100644 --- a/src/libmime/mime_expressions.c +++ b/src/libmime/mime_expressions.c @@ -1625,7 +1625,7 @@ rspamd_has_fake_html (struct rspamd_task * task, GArray * args, void *unused) gboolean res = FALSE; PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, p) { - if (IS_TEXT_PART_HTML (p) && (p->html == NULL || p->html->html_tags == NULL)) { + if (IS_TEXT_PART_HTML (p) && (p->html == NULL)) { res = TRUE; } diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index 00f1d331f..47e4e81a0 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -2387,4 +2387,15 @@ rspamd_html_find_embedded_image(void *html_content, } return nullptr; +} + +bool +rspamd_html_get_parsed_content(void *html_content, rspamd_ftok_t *dest) +{ + auto *hc = rspamd::html::html_content::from_ptr(html_content); + + dest->begin = hc->parsed.data(); + dest->len = hc->parsed.size(); + + return true; } \ No newline at end of file diff --git a/src/libserver/html/html.h b/src/libserver/html/html.h index 1e71d0c2d..3b6592402 100644 --- a/src/libserver/html/html.h +++ b/src/libserver/html/html.h @@ -154,6 +154,14 @@ const gchar *rspamd_html_tag_name(void *tag, gsize *len); struct html_image* rspamd_html_find_embedded_image(void *html_content, const char *cid, gsize cid_len); +/** + * Stores parsed content in ftok_t structure + * @param html_content + * @param dest + * @return + */ +bool rspamd_html_get_parsed_content(void *html_content, rspamd_ftok_t *dest); + #ifdef __cplusplus } diff --git a/src/libserver/re_cache.c b/src/libserver/re_cache.c index 1b591a81c..631981b30 100644 --- a/src/libserver/re_cache.c +++ b/src/libserver/re_cache.c @@ -1224,8 +1224,8 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, raw = TRUE; } - in = text_part->utf_content->data; - len = text_part->utf_content->len; + in = text_part->utf_content.begin; + len = text_part->utf_content.len; } } diff --git a/src/lua/lua_html.cxx b/src/lua/lua_html.cxx index 30bfa55d6..4dd59083c 100644 --- a/src/lua/lua_html.cxx +++ b/src/lua/lua_html.cxx @@ -16,6 +16,7 @@ #include "lua_common.h" #include "message.h" #include "libserver/html/html.h" +#include "libserver/html/html.hxx" #include "libserver/html/html_tag.hxx" #include "images.h" @@ -180,12 +181,12 @@ static const struct luaL_reg taglib_m[] = { {NULL, NULL} }; -static struct html_content * +static struct rspamd::html::html_content * lua_check_html (lua_State * L, gint pos) { void *ud = rspamd_lua_check_udata (L, pos, "rspamd{html}"); luaL_argcheck (L, ud != NULL, pos, "'html' expected"); - return ud ? *((struct html_content **)ud) : NULL; + return ud ? *((struct rspamd::html::html_content **)ud) : NULL; } struct lua_html_tag { @@ -205,7 +206,7 @@ static gint lua_html_has_tag (lua_State *L) { LUA_TRACE_POINT; - struct html_content *hc = lua_check_html (L, 1); + auto *hc = lua_check_html (L, 1); const gchar *tagname = luaL_checkstring (L, 2); gboolean ret = FALSE; @@ -238,7 +239,7 @@ static gint lua_html_has_property (lua_State *L) { LUA_TRACE_POINT; - struct html_content *hc = lua_check_html (L, 1); + auto *hc = lua_check_html (L, 1); const gchar *propname = luaL_checkstring (L, 2); gboolean ret = FALSE; @@ -256,7 +257,7 @@ lua_html_has_property (lua_State *L) } static void -lua_html_push_image (lua_State *L, struct html_image *img) +lua_html_push_image (lua_State *L, const struct html_image *img) { LUA_TRACE_POINT; struct lua_html_tag *ltag; @@ -319,22 +320,15 @@ static gint lua_html_get_images (lua_State *L) { LUA_TRACE_POINT; - struct html_content *hc = lua_check_html (L, 1); - struct html_image *img; - - guint i; + auto *hc = lua_check_html (L, 1); + guint i = 1; if (hc != NULL) { - if (hc->images) { - lua_createtable (L, hc->images->len, 0); + lua_createtable (L, hc->images.size(), 0); - PTR_ARRAY_FOREACH (hc->images, i, img) { - lua_html_push_image (L, img); - lua_rawseti (L, -2, i + 1); - } - } - else { - lua_newtable (L); + for (const auto *img : hc->images) { + lua_html_push_image (L, img); + lua_rawseti (L, -2, i++); } } else { @@ -410,14 +404,14 @@ static gint lua_html_get_blocks (lua_State *L) { LUA_TRACE_POINT; - struct html_content *hc = lua_check_html (L, 1); + auto *hc = lua_check_html (L, 1); struct html_block *bl; guint i; if (hc != NULL) { - if (hc->blocks && hc->blocks->len > 0) { - lua_createtable (L, hc->blocks->len, 0); + if (hc->blocks.size() > 0) { + lua_createtable (L, hc->blocks.size(), 0); for (i = 0; i < hc->blocks->len; i ++) { bl = static_cast(g_ptr_array_index (hc->blocks, i)); diff --git a/src/lua/lua_mimepart.c b/src/lua/lua_mimepart.c index fe8bb4246..b6e5b157d 100644 --- a/src/lua/lua_mimepart.c +++ b/src/lua/lua_mimepart.c @@ -694,8 +694,8 @@ lua_textpart_get_content (lua_State * L) lua_pushnil (L); return 1; } - start = part->utf_content->data; - len = part->utf_content->len; + start = part->utf_content.begin; + len = part->utf_content.len; } else if (strcmp (type, "content") == 0) { if (IS_TEXT_PART_EMPTY (part)) { @@ -703,8 +703,8 @@ lua_textpart_get_content (lua_State * L) return 1; } - start = part->utf_content->data; - len = part->utf_content->len; + start = part->utf_content.begin; + len = part->utf_content.len; } else if (strcmp (type, "content_oneline") == 0) { if (IS_TEXT_PART_EMPTY (part)) { @@ -809,11 +809,11 @@ lua_textpart_get_length (lua_State * L) return 1; } - if (IS_TEXT_PART_EMPTY (part) || part->utf_content == NULL) { + if (IS_TEXT_PART_EMPTY (part) || part->utf_content.len == 0) { lua_pushinteger (L, 0); } else { - lua_pushinteger (L, part->utf_content->len); + lua_pushinteger (L, part->utf_content.len); } return 1; diff --git a/src/lua/lua_parsers.c b/src/lua/lua_parsers.c index a0c2f264d..6c75d8039 100644 --- a/src/lua/lua_parsers.c +++ b/src/lua/lua_parsers.c @@ -206,9 +206,9 @@ lua_parsers_parse_html (lua_State *L) struct rspamd_lua_text *t; const gchar *start = NULL; gsize len; - GByteArray *res, *in; + GByteArray *in; rspamd_mempool_t *pool; - struct html_content *hc; + void *hc; if (lua_type (L, 1) == LUA_TUSERDATA) { t = lua_check_text (L, 1); @@ -224,19 +224,15 @@ lua_parsers_parse_html (lua_State *L) if (start != NULL) { pool = rspamd_mempool_new (rspamd_mempool_suggest_size (), NULL, 0); - hc = rspamd_mempool_alloc0 (pool, sizeof (*hc)); in = g_byte_array_sized_new (len); g_byte_array_append (in, start, len); - res = rspamd_html_process_part (pool, hc, in); + hc = rspamd_html_process_part(pool, in); - t = lua_newuserdata (L, sizeof (*t)); - rspamd_lua_setclass (L, "rspamd{text}", -1); - t->start = res->data; - t->len = res->len; - t->flags = RSPAMD_TEXT_FLAG_OWN; + rspamd_ftok_t res; + rspamd_html_get_parsed_content(hc, &res); + lua_new_text(L, res.begin, res.len, TRUE); - g_byte_array_free (res, FALSE); g_byte_array_free (in, TRUE); rspamd_mempool_delete (pool); } diff --git a/src/lua/lua_trie.c b/src/lua/lua_trie.c index 33e5832a8..3b1e946ec 100644 --- a/src/lua/lua_trie.c +++ b/src/lua/lua_trie.c @@ -375,9 +375,9 @@ lua_trie_search_mime (lua_State *L) if (trie && task) { PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, part) { - if (!IS_TEXT_PART_EMPTY (part) && part->utf_content != NULL) { - text = part->utf_content->data; - len = part->utf_content->len; + if (!IS_TEXT_PART_EMPTY (part) && part->utf_content.len > 0) { + text = part->utf_content.begin; + len = part->utf_content.len; if (lua_trie_search_str (L, trie, text, len, cb) != 0) { found = TRUE;