From 51e8a87ce9f7db083b5f3ac74067b08fe9ea5385 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Wed, 19 Nov 2014 14:19:50 +0000 Subject: [PATCH] Decode entities when normalizing HTML parts. --- src/libmime/message.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/src/libmime/message.c b/src/libmime/message.c index 382567f77..164a1bd9b 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -42,12 +42,13 @@ strip_html_tags (struct rspamd_task *task, GByteArray * src, gint *stateptr) { - uint8_t *p, *rp, *tbegin = NULL, *end, c, lc; + uint8_t *p, *rp, *tbegin = NULL, *end, c, lc, *estart; gint br, i = 0, depth = 0, in_q = 0; gint state = 0; + guint dlen; GByteArray *buf; GNode *level_ptr = NULL; - gboolean erase = FALSE; + gboolean erase = FALSE, html_decode = FALSE; if (stateptr) state = *stateptr; @@ -204,6 +205,25 @@ unbreak_tag: } break; + case '&': + /* Decode entity */ + html_decode = TRUE; + estart = rp; + goto reg_char; + break; + + case ';': + if (html_decode) { + html_decode = FALSE; + *rp = ';'; + if (rp - estart > 0) { + dlen = rp - estart + 1; + decode_entitles (estart, &dlen); + rp = estart + dlen; + } + } + break; + case '?': if (state == 1 && *(p - 1) == '<') { @@ -211,7 +231,6 @@ unbreak_tag: state = 2; break; } - case 'E': case 'e': /* !DOCTYPE exception */ @@ -226,7 +245,6 @@ unbreak_tag: break; } /* fall-through */ - case 'l': /* swm: If we encounter '