@@ -522,10 +522,10 @@ rspamd_normalize_text_part (struct rspamd_task *task, | |||
part->utf_stripped_content = g_byte_array_new (); | |||
} | |||
else { | |||
part->utf_stripped_content = g_byte_array_sized_new (part->utf_content->len); | |||
part->utf_stripped_content = g_byte_array_sized_new (part->utf_content.len); | |||
p = (const gchar *)part->utf_content->data; | |||
end = p + part->utf_content->len; | |||
p = (const gchar *)part->utf_content.begin; | |||
end = p + part->utf_content.len; | |||
rspamd_strip_newlines_parse (task, p, end, part); | |||
@@ -668,10 +668,10 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part | |||
g_assert (rspamd_multipattern_compile (gtube_matcher, NULL)); | |||
} | |||
if (part->utf_content && part->utf_content->len >= sizeof (gtube_pattern_reject) && | |||
part->utf_content->len <= max_check_size) { | |||
if ((ret = rspamd_multipattern_lookup (gtube_matcher, part->utf_content->data, | |||
part->utf_content->len, | |||
if (part->utf_content.len >= sizeof (gtube_pattern_reject) && | |||
part->utf_content.len <= max_check_size) { | |||
if ((ret = rspamd_multipattern_lookup (gtube_matcher, part->utf_content.begin, | |||
part->utf_content.len, | |||
rspamd_multipattern_gtube_cb, task, NULL)) > 0) { | |||
switch (ret) { | |||
@@ -698,7 +698,7 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part | |||
msg_info_task ( | |||
"gtube %s pattern has been found in part of length %ud", | |||
rspamd_action_to_str (act), | |||
part->utf_content->len); | |||
part->utf_content.len); | |||
} | |||
} | |||
} | |||
@@ -728,13 +728,16 @@ rspamd_message_process_plain_text_part (struct rspamd_task *task, | |||
if (text_part->utf_raw_content != NULL) { | |||
/* Just have the same content */ | |||
text_part->utf_content = text_part->utf_raw_content; | |||
text_part->utf_content.begin = (const gchar *)text_part->utf_raw_content->data; | |||
text_part->utf_content.len = text_part->utf_raw_content->len; | |||
} | |||
else { | |||
/* | |||
* We ignore unconverted parts from now as it is dangerous | |||
* to treat them as text parts | |||
*/ | |||
text_part->utf_content.begin = NULL; | |||
text_part->utf_content.len = 0; | |||
return FALSE; | |||
} | |||
@@ -760,26 +763,21 @@ rspamd_message_process_html_text_part (struct rspamd_task *task, | |||
return FALSE; | |||
} | |||
text_part->html = rspamd_mempool_alloc0 (task->task_pool, | |||
sizeof (*text_part->html)); | |||
text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_BALANCED; | |||
text_part->utf_content = rspamd_html_process_part_full ( | |||
text_part->html = rspamd_html_process_part_full ( | |||
task->task_pool, | |||
text_part->html, | |||
text_part->utf_raw_content, | |||
&text_part->exceptions, | |||
MESSAGE_FIELD (task, urls), | |||
text_part->mime_part->urls, | |||
task->cfg ? task->cfg->enable_css_parser : false); | |||
rspamd_html_get_parsed_content(text_part->html, &text_part->utf_content); | |||
if (text_part->utf_content->len == 0) { | |||
if (text_part->utf_content.len == 0) { | |||
text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY; | |||
} | |||
rspamd_mempool_add_destructor (task->task_pool, | |||
(rspamd_mempool_destruct_t) free_byte_array_callback, | |||
text_part->utf_content); | |||
return TRUE; | |||
} | |||
@@ -1546,7 +1544,7 @@ rspamd_message_process (struct rspamd_task *task) | |||
sel = p2; | |||
} | |||
else { | |||
if (p1->utf_content->len > p2->utf_content->len) { | |||
if (p1->utf_content.len > p2->utf_content.len) { | |||
sel = p1; | |||
} | |||
else { | |||
@@ -1659,4 +1657,4 @@ void rspamd_message_update_digest (struct rspamd_message *msg, | |||
memcpy (n, msg->digest, sizeof (msg->digest)); | |||
n[0] = t1ha2_atonce128 (&n[1], input, len, n[0]); | |||
memcpy (msg->digest, n, sizeof (msg->digest)); | |||
} | |||
} |
@@ -138,7 +138,7 @@ struct rspamd_mime_text_part { | |||
rspamd_ftok_t parsed; /* decoded from mime encodings */ | |||
/* UTF8 content */ | |||
GByteArray *utf_content; /* utf8 encoded processed content */ | |||
rspamd_ftok_t utf_content; /* utf8 encoded processed content */ | |||
GByteArray *utf_raw_content; /* utf raw content */ | |||
GByteArray *utf_stripped_content; /* utf content with no newlines */ | |||
GArray *normalized_hashes; /* Array of guint64 */ |
@@ -1625,7 +1625,7 @@ rspamd_has_fake_html (struct rspamd_task * task, GArray * args, void *unused) | |||
gboolean res = FALSE; | |||
PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, p) { | |||
if (IS_TEXT_PART_HTML (p) && (p->html == NULL || p->html->html_tags == NULL)) { | |||
if (IS_TEXT_PART_HTML (p) && (p->html == NULL)) { | |||
res = TRUE; | |||
} | |||
@@ -2387,4 +2387,15 @@ rspamd_html_find_embedded_image(void *html_content, | |||
} | |||
return nullptr; | |||
} | |||
/*
 * Fills dest with the start pointer and length of the parsed content held by
 * the HTML content object behind html_content.
 * The opaque pointer is converted with html_content::from_ptr; the bytes are
 * not copied, dest simply refers to hc->parsed.
 * NOTE(review): neither html_content nor dest is checked for NULL here -
 * presumably callers guarantee both; confirm.
 * Always returns true.
 */
bool | |||
rspamd_html_get_parsed_content(void *html_content, rspamd_ftok_t *dest) | |||
{ | |||
auto *hc = rspamd::html::html_content::from_ptr(html_content); | |||
/* Borrow the parsed buffer: dest aliases storage owned by hc */
dest->begin = hc->parsed.data(); | |||
dest->len = hc->parsed.size(); | |||
return true; | |||
} | |||
@@ -154,6 +154,14 @@ const gchar *rspamd_html_tag_name(void *tag, gsize *len); | |||
struct html_image* rspamd_html_find_embedded_image(void *html_content, | |||
const char *cid, gsize cid_len); | |||
/** | |||
* Stores parsed content in ftok_t structure.
* The token refers to the parsed content kept inside the HTML content object;
* no copy of the data is made.
* NOTE(review): dest is presumably usable only while html_content is alive - confirm.
* @param html_content opaque pointer to the HTML content object
* @param dest token that receives the start pointer and length of the parsed content
* @return true (the visible implementation has no failure path)
*/ | |||
bool rspamd_html_get_parsed_content(void *html_content, rspamd_ftok_t *dest); | |||
#ifdef __cplusplus | |||
} |
@@ -1224,8 +1224,8 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, | |||
raw = TRUE; | |||
} | |||
in = text_part->utf_content->data; | |||
len = text_part->utf_content->len; | |||
in = text_part->utf_content.begin; | |||
len = text_part->utf_content.len; | |||
} | |||
} | |||
@@ -16,6 +16,7 @@ | |||
#include "lua_common.h" | |||
#include "message.h" | |||
#include "libserver/html/html.h" | |||
#include "libserver/html/html.hxx" | |||
#include "libserver/html/html_tag.hxx" | |||
#include "images.h" | |||
@@ -180,12 +181,12 @@ static const struct luaL_reg taglib_m[] = { | |||
{NULL, NULL} | |||
}; | |||
/*
 * Fetches the html content pointer stored in the Lua userdata at stack
 * index pos.
 * The userdata is looked up with the "rspamd{html}" class name; luaL_argcheck
 * reports "'html' expected" for argument pos when the lookup yields NULL.
 * Returns the pointer stored in the userdata, or NULL when no userdata was
 * obtained.
 * NOTE(review): this span carries two variants of the return-type line and of
 * the return statement (plain html_content and rspamd::html::html_content) -
 * confirm which one is meant to remain.
 */
static struct html_content * | |||
static struct rspamd::html::html_content * | |||
lua_check_html (lua_State * L, gint pos) | |||
{ | |||
void *ud = rspamd_lua_check_udata (L, pos, "rspamd{html}"); | |||
luaL_argcheck (L, ud != NULL, pos, "'html' expected"); | |||
/* The userdata holds a pointer to the content object, hence the double indirection */
return ud ? *((struct html_content **)ud) : NULL; | |||
return ud ? *((struct rspamd::html::html_content **)ud) : NULL; | |||
} | |||
struct lua_html_tag { | |||
@@ -205,7 +206,7 @@ static gint | |||
lua_html_has_tag (lua_State *L) | |||
{ | |||
LUA_TRACE_POINT; | |||
struct html_content *hc = lua_check_html (L, 1); | |||
auto *hc = lua_check_html (L, 1); | |||
const gchar *tagname = luaL_checkstring (L, 2); | |||
gboolean ret = FALSE; | |||
@@ -238,7 +239,7 @@ static gint | |||
lua_html_has_property (lua_State *L) | |||
{ | |||
LUA_TRACE_POINT; | |||
struct html_content *hc = lua_check_html (L, 1); | |||
auto *hc = lua_check_html (L, 1); | |||
const gchar *propname = luaL_checkstring (L, 2); | |||
gboolean ret = FALSE; | |||
@@ -256,7 +257,7 @@ lua_html_has_property (lua_State *L) | |||
} | |||
static void | |||
lua_html_push_image (lua_State *L, struct html_image *img) | |||
lua_html_push_image (lua_State *L, const struct html_image *img) | |||
{ | |||
LUA_TRACE_POINT; | |||
struct lua_html_tag *ltag; | |||
@@ -319,22 +320,15 @@ static gint | |||
lua_html_get_images (lua_State *L) | |||
{ | |||
LUA_TRACE_POINT; | |||
struct html_content *hc = lua_check_html (L, 1); | |||
struct html_image *img; | |||
guint i; | |||
auto *hc = lua_check_html (L, 1); | |||
guint i = 1; | |||
if (hc != NULL) { | |||
if (hc->images) { | |||
lua_createtable (L, hc->images->len, 0); | |||
lua_createtable (L, hc->images.size(), 0); | |||
PTR_ARRAY_FOREACH (hc->images, i, img) { | |||
lua_html_push_image (L, img); | |||
lua_rawseti (L, -2, i + 1); | |||
} | |||
} | |||
else { | |||
lua_newtable (L); | |||
for (const auto *img : hc->images) { | |||
lua_html_push_image (L, img); | |||
lua_rawseti (L, -2, i++); | |||
} | |||
} | |||
else { | |||
@@ -410,14 +404,14 @@ static gint | |||
lua_html_get_blocks (lua_State *L) | |||
{ | |||
LUA_TRACE_POINT; | |||
struct html_content *hc = lua_check_html (L, 1); | |||
auto *hc = lua_check_html (L, 1); | |||
struct html_block *bl; | |||
guint i; | |||
if (hc != NULL) { | |||
if (hc->blocks && hc->blocks->len > 0) { | |||
lua_createtable (L, hc->blocks->len, 0); | |||
if (hc->blocks.size() > 0) { | |||
lua_createtable (L, hc->blocks.size(), 0); | |||
for (i = 0; i < hc->blocks->len; i ++) { | |||
bl = static_cast<decltype(bl)>(g_ptr_array_index (hc->blocks, i)); |
@@ -694,8 +694,8 @@ lua_textpart_get_content (lua_State * L) | |||
lua_pushnil (L); | |||
return 1; | |||
} | |||
start = part->utf_content->data; | |||
len = part->utf_content->len; | |||
start = part->utf_content.begin; | |||
len = part->utf_content.len; | |||
} | |||
else if (strcmp (type, "content") == 0) { | |||
if (IS_TEXT_PART_EMPTY (part)) { | |||
@@ -703,8 +703,8 @@ lua_textpart_get_content (lua_State * L) | |||
return 1; | |||
} | |||
start = part->utf_content->data; | |||
len = part->utf_content->len; | |||
start = part->utf_content.begin; | |||
len = part->utf_content.len; | |||
} | |||
else if (strcmp (type, "content_oneline") == 0) { | |||
if (IS_TEXT_PART_EMPTY (part)) { | |||
@@ -809,11 +809,11 @@ lua_textpart_get_length (lua_State * L) | |||
return 1; | |||
} | |||
if (IS_TEXT_PART_EMPTY (part) || part->utf_content == NULL) { | |||
if (IS_TEXT_PART_EMPTY (part) || part->utf_content.len == 0) { | |||
lua_pushinteger (L, 0); | |||
} | |||
else { | |||
lua_pushinteger (L, part->utf_content->len); | |||
lua_pushinteger (L, part->utf_content.len); | |||
} | |||
return 1; |
@@ -206,9 +206,9 @@ lua_parsers_parse_html (lua_State *L) | |||
struct rspamd_lua_text *t; | |||
const gchar *start = NULL; | |||
gsize len; | |||
GByteArray *res, *in; | |||
GByteArray *in; | |||
rspamd_mempool_t *pool; | |||
struct html_content *hc; | |||
void *hc; | |||
if (lua_type (L, 1) == LUA_TUSERDATA) { | |||
t = lua_check_text (L, 1); | |||
@@ -224,19 +224,15 @@ lua_parsers_parse_html (lua_State *L) | |||
if (start != NULL) { | |||
pool = rspamd_mempool_new (rspamd_mempool_suggest_size (), NULL, 0); | |||
hc = rspamd_mempool_alloc0 (pool, sizeof (*hc)); | |||
in = g_byte_array_sized_new (len); | |||
g_byte_array_append (in, start, len); | |||
res = rspamd_html_process_part (pool, hc, in); | |||
hc = rspamd_html_process_part(pool, in); | |||
t = lua_newuserdata (L, sizeof (*t)); | |||
rspamd_lua_setclass (L, "rspamd{text}", -1); | |||
t->start = res->data; | |||
t->len = res->len; | |||
t->flags = RSPAMD_TEXT_FLAG_OWN; | |||
rspamd_ftok_t res; | |||
rspamd_html_get_parsed_content(hc, &res); | |||
lua_new_text(L, res.begin, res.len, TRUE); | |||
g_byte_array_free (res, FALSE); | |||
g_byte_array_free (in, TRUE); | |||
rspamd_mempool_delete (pool); | |||
} |
@@ -375,9 +375,9 @@ lua_trie_search_mime (lua_State *L) | |||
if (trie && task) { | |||
PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, part) { | |||
if (!IS_TEXT_PART_EMPTY (part) && part->utf_content != NULL) { | |||
text = part->utf_content->data; | |||
len = part->utf_content->len; | |||
if (!IS_TEXT_PART_EMPTY (part) && part->utf_content.len > 0) { | |||
text = part->utf_content.begin; | |||
len = part->utf_content.len; | |||
if (lua_trie_search_str (L, trie, text, len, cb) != 0) { | |||
found = TRUE; |