Browse Source

[Rework] Html: Deal with the utf_content part

tags/3.0
Vsevolod Stakhov 3 years ago
parent
commit
1d3c9379b9

+ 18
- 20
src/libmime/message.c View File

@@ -522,10 +522,10 @@ rspamd_normalize_text_part (struct rspamd_task *task,
part->utf_stripped_content = g_byte_array_new ();
}
else {
part->utf_stripped_content = g_byte_array_sized_new (part->utf_content->len);
part->utf_stripped_content = g_byte_array_sized_new (part->utf_content.len);

p = (const gchar *)part->utf_content->data;
end = p + part->utf_content->len;
p = (const gchar *)part->utf_content.begin;
end = p + part->utf_content.len;

rspamd_strip_newlines_parse (task, p, end, part);

@@ -668,10 +668,10 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part
g_assert (rspamd_multipattern_compile (gtube_matcher, NULL));
}

if (part->utf_content && part->utf_content->len >= sizeof (gtube_pattern_reject) &&
part->utf_content->len <= max_check_size) {
if ((ret = rspamd_multipattern_lookup (gtube_matcher, part->utf_content->data,
part->utf_content->len,
if (part->utf_content.len >= sizeof (gtube_pattern_reject) &&
part->utf_content.len <= max_check_size) {
if ((ret = rspamd_multipattern_lookup (gtube_matcher, part->utf_content.begin,
part->utf_content.len,
rspamd_multipattern_gtube_cb, task, NULL)) > 0) {

switch (ret) {
@@ -698,7 +698,7 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part
msg_info_task (
"gtube %s pattern has been found in part of length %ud",
rspamd_action_to_str (act),
part->utf_content->len);
part->utf_content.len);
}
}
}
@@ -728,13 +728,16 @@ rspamd_message_process_plain_text_part (struct rspamd_task *task,

if (text_part->utf_raw_content != NULL) {
/* Just have the same content */
text_part->utf_content = text_part->utf_raw_content;
text_part->utf_content.begin = (const gchar *)text_part->utf_raw_content->data;
text_part->utf_content.len = text_part->utf_raw_content->len;
}
else {
/*
* We ignore unconverted parts from now as it is dangerous
* to treat them as text parts
*/
text_part->utf_content.begin = NULL;
text_part->utf_content.len = 0;

return FALSE;
}
@@ -760,26 +763,21 @@ rspamd_message_process_html_text_part (struct rspamd_task *task,
return FALSE;
}

text_part->html = rspamd_mempool_alloc0 (task->task_pool,
sizeof (*text_part->html));

text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_BALANCED;
text_part->utf_content = rspamd_html_process_part_full (
text_part->html = rspamd_html_process_part_full (
task->task_pool,
text_part->html,
text_part->utf_raw_content,
&text_part->exceptions,
MESSAGE_FIELD (task, urls),
text_part->mime_part->urls,
task->cfg ? task->cfg->enable_css_parser : false);
rspamd_html_get_parsed_content(text_part->html, &text_part->utf_content);

if (text_part->utf_content->len == 0) {
if (text_part->utf_content.len == 0) {
text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
}

rspamd_mempool_add_destructor (task->task_pool,
(rspamd_mempool_destruct_t) free_byte_array_callback,
text_part->utf_content);

return TRUE;
}

@@ -1546,7 +1544,7 @@ rspamd_message_process (struct rspamd_task *task)
sel = p2;
}
else {
if (p1->utf_content->len > p2->utf_content->len) {
if (p1->utf_content.len > p2->utf_content.len) {
sel = p1;
}
else {
@@ -1659,4 +1657,4 @@ void rspamd_message_update_digest (struct rspamd_message *msg,
memcpy (n, msg->digest, sizeof (msg->digest));
n[0] = t1ha2_atonce128 (&n[1], input, len, n[0]);
memcpy (msg->digest, n, sizeof (msg->digest));
}
}

+ 1
- 1
src/libmime/message.h View File

@@ -138,7 +138,7 @@ struct rspamd_mime_text_part {
rspamd_ftok_t parsed; /* decoded from mime encodings */

/* UTF8 content */
GByteArray *utf_content; /* utf8 encoded processed content */
rspamd_ftok_t utf_content; /* utf8 encoded processed content */
GByteArray *utf_raw_content; /* utf raw content */
GByteArray *utf_stripped_content; /* utf content with no newlines */
GArray *normalized_hashes; /* Array of guint64 */

+ 1
- 1
src/libmime/mime_expressions.c View File

@@ -1625,7 +1625,7 @@ rspamd_has_fake_html (struct rspamd_task * task, GArray * args, void *unused)
gboolean res = FALSE;

PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, p) {
if (IS_TEXT_PART_HTML (p) && (p->html == NULL || p->html->html_tags == NULL)) {
if (IS_TEXT_PART_HTML (p) && (p->html == NULL)) {
res = TRUE;
}


+ 11
- 0
src/libserver/html/html.cxx View File

@@ -2387,4 +2387,15 @@ rspamd_html_find_embedded_image(void *html_content,
}

return nullptr;
}

/**
 * Exposes the parsed content of an HTML part as an ftok view.
 *
 * On success dest->begin and dest->len are set from the data pointer and
 * the size of the `parsed` member of the content object. Nothing is copied,
 * so dest refers to memory held by that object.
 *
 * @param html_content opaque pointer, converted via html_content::from_ptr
 * @param dest token to fill
 * @return true if dest has been filled from the parsed content; false if
 * either argument is null (a non-null dest is reset to an empty token)
 */
bool
rspamd_html_get_parsed_content(void *html_content, rspamd_ftok_t *dest)
{
	if (dest == nullptr) {
		return false;
	}

	if (html_content == nullptr) {
		/* Leave an empty token so callers that only test `len` see no content */
		dest->begin = nullptr;
		dest->len = 0;

		return false;
	}

	auto *hc = rspamd::html::html_content::from_ptr(html_content);

	dest->begin = hc->parsed.data();
	dest->len = hc->parsed.size();

	return true;
}

+ 8
- 0
src/libserver/html/html.h View File

@@ -154,6 +154,14 @@ const gchar *rspamd_html_tag_name(void *tag, gsize *len);
struct html_image* rspamd_html_find_embedded_image(void *html_content,
const char *cid, gsize cid_len);

/**
* Stores parsed content in ftok_t structure
*
* The token is pointed at the parsed buffer held by the html content object;
* no copy is made.
* NOTE(review): dest presumably stays valid only while html_content is alive - confirm
* @param html_content opaque pointer to the html content object
* @param dest token whose begin/len fields are filled with the parsed content
* @return true when dest has been filled from the parsed content
*/
bool rspamd_html_get_parsed_content(void *html_content, rspamd_ftok_t *dest);


#ifdef __cplusplus
}

+ 2
- 2
src/libserver/re_cache.c View File

@@ -1224,8 +1224,8 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
raw = TRUE;
}

in = text_part->utf_content->data;
len = text_part->utf_content->len;
in = text_part->utf_content.begin;
len = text_part->utf_content.len;
}
}


+ 15
- 21
src/lua/lua_html.cxx View File

@@ -16,6 +16,7 @@
#include "lua_common.h"
#include "message.h"
#include "libserver/html/html.h"
#include "libserver/html/html.hxx"
#include "libserver/html/html_tag.hxx"
#include "images.h"

@@ -180,12 +181,12 @@ static const struct luaL_reg taglib_m[] = {
{NULL, NULL}
};

static struct html_content *
static struct rspamd::html::html_content *
lua_check_html (lua_State * L, gint pos)
{
void *ud = rspamd_lua_check_udata (L, pos, "rspamd{html}");
luaL_argcheck (L, ud != NULL, pos, "'html' expected");
return ud ? *((struct html_content **)ud) : NULL;
return ud ? *((struct rspamd::html::html_content **)ud) : NULL;
}

struct lua_html_tag {
@@ -205,7 +206,7 @@ static gint
lua_html_has_tag (lua_State *L)
{
LUA_TRACE_POINT;
struct html_content *hc = lua_check_html (L, 1);
auto *hc = lua_check_html (L, 1);
const gchar *tagname = luaL_checkstring (L, 2);
gboolean ret = FALSE;

@@ -238,7 +239,7 @@ static gint
lua_html_has_property (lua_State *L)
{
LUA_TRACE_POINT;
struct html_content *hc = lua_check_html (L, 1);
auto *hc = lua_check_html (L, 1);
const gchar *propname = luaL_checkstring (L, 2);
gboolean ret = FALSE;

@@ -256,7 +257,7 @@ lua_html_has_property (lua_State *L)
}

static void
lua_html_push_image (lua_State *L, struct html_image *img)
lua_html_push_image (lua_State *L, const struct html_image *img)
{
LUA_TRACE_POINT;
struct lua_html_tag *ltag;
@@ -319,22 +320,15 @@ static gint
lua_html_get_images (lua_State *L)
{
LUA_TRACE_POINT;
struct html_content *hc = lua_check_html (L, 1);
struct html_image *img;

guint i;
auto *hc = lua_check_html (L, 1);
guint i = 1;

if (hc != NULL) {
if (hc->images) {
lua_createtable (L, hc->images->len, 0);
lua_createtable (L, hc->images.size(), 0);

PTR_ARRAY_FOREACH (hc->images, i, img) {
lua_html_push_image (L, img);
lua_rawseti (L, -2, i + 1);
}
}
else {
lua_newtable (L);
for (const auto *img : hc->images) {
lua_html_push_image (L, img);
lua_rawseti (L, -2, i++);
}
}
else {
@@ -410,14 +404,14 @@ static gint
lua_html_get_blocks (lua_State *L)
{
LUA_TRACE_POINT;
struct html_content *hc = lua_check_html (L, 1);
auto *hc = lua_check_html (L, 1);
struct html_block *bl;

guint i;

if (hc != NULL) {
if (hc->blocks && hc->blocks->len > 0) {
lua_createtable (L, hc->blocks->len, 0);
if (hc->blocks.size() > 0) {
lua_createtable (L, hc->blocks.size(), 0);

for (i = 0; i < hc->blocks->len; i ++) {
bl = static_cast<decltype(bl)>(g_ptr_array_index (hc->blocks, i));

+ 6
- 6
src/lua/lua_mimepart.c View File

@@ -694,8 +694,8 @@ lua_textpart_get_content (lua_State * L)
lua_pushnil (L);
return 1;
}
start = part->utf_content->data;
len = part->utf_content->len;
start = part->utf_content.begin;
len = part->utf_content.len;
}
else if (strcmp (type, "content") == 0) {
if (IS_TEXT_PART_EMPTY (part)) {
@@ -703,8 +703,8 @@ lua_textpart_get_content (lua_State * L)
return 1;
}

start = part->utf_content->data;
len = part->utf_content->len;
start = part->utf_content.begin;
len = part->utf_content.len;
}
else if (strcmp (type, "content_oneline") == 0) {
if (IS_TEXT_PART_EMPTY (part)) {
@@ -809,11 +809,11 @@ lua_textpart_get_length (lua_State * L)
return 1;
}

if (IS_TEXT_PART_EMPTY (part) || part->utf_content == NULL) {
if (IS_TEXT_PART_EMPTY (part) || part->utf_content.len == 0) {
lua_pushinteger (L, 0);
}
else {
lua_pushinteger (L, part->utf_content->len);
lua_pushinteger (L, part->utf_content.len);
}

return 1;

+ 6
- 10
src/lua/lua_parsers.c View File

@@ -206,9 +206,9 @@ lua_parsers_parse_html (lua_State *L)
struct rspamd_lua_text *t;
const gchar *start = NULL;
gsize len;
GByteArray *res, *in;
GByteArray *in;
rspamd_mempool_t *pool;
struct html_content *hc;
void *hc;

if (lua_type (L, 1) == LUA_TUSERDATA) {
t = lua_check_text (L, 1);
@@ -224,19 +224,15 @@ lua_parsers_parse_html (lua_State *L)

if (start != NULL) {
pool = rspamd_mempool_new (rspamd_mempool_suggest_size (), NULL, 0);
hc = rspamd_mempool_alloc0 (pool, sizeof (*hc));
in = g_byte_array_sized_new (len);
g_byte_array_append (in, start, len);

res = rspamd_html_process_part (pool, hc, in);
hc = rspamd_html_process_part(pool, in);

t = lua_newuserdata (L, sizeof (*t));
rspamd_lua_setclass (L, "rspamd{text}", -1);
t->start = res->data;
t->len = res->len;
t->flags = RSPAMD_TEXT_FLAG_OWN;
rspamd_ftok_t res;
rspamd_html_get_parsed_content(hc, &res);
lua_new_text(L, res.begin, res.len, TRUE);

g_byte_array_free (res, FALSE);
g_byte_array_free (in, TRUE);
rspamd_mempool_delete (pool);
}

+ 3
- 3
src/lua/lua_trie.c View File

@@ -375,9 +375,9 @@ lua_trie_search_mime (lua_State *L)

if (trie && task) {
PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, part) {
if (!IS_TEXT_PART_EMPTY (part) && part->utf_content != NULL) {
text = part->utf_content->data;
len = part->utf_content->len;
if (!IS_TEXT_PART_EMPTY (part) && part->utf_content.len > 0) {
text = part->utf_content.begin;
len = part->utf_content.len;

if (lua_trie_search_str (L, trie, text, len, cb) != 0) {
found = TRUE;

Loading…
Cancel
Save