From: Vsevolod Stakhov Date: Tue, 16 Feb 2016 12:04:51 +0000 (+0000) Subject: Fix html images rules to reduce FP rates X-Git-Tag: 1.2.0~224 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=bd9428b14979c7f06cc7871498e93f1893f1ec4f;p=rspamd.git Fix html images rules to reduce FP rates Suggested by: @moisseev Issue: #525 --- diff --git a/rules/html.lua b/rules/html.lua index c2635a8d0..0f26d1421 100644 --- a/rules/html.lua +++ b/rules/html.lua @@ -33,8 +33,14 @@ local function check_html_image(task, min, max) local images = hc:get_images() if images then for _,i in ipairs(images) do - if i['embedded'] then - return true + local tag = i['tag'] + if tag then + local parent = tag:get_parent() + if parent then + if parent:get_type() == 'a' then + return true + end + end end end end @@ -83,8 +89,16 @@ rspamd_config.R_EMPTY_IMAGE = { if images then -- if there are images for _,i in ipairs(images) do -- then iterate over images in the part - if i['embedded'] and i['height'] + i['width'] >= 400 then -- if we have a large image - return true -- add symbol + if i['height'] + i['width'] >= 400 then -- if we have a large image + local tag = i['tag'] + if tag then + local parent = tag:get_parent() + if parent then + if parent:get_type() == 'a' then + return true + end + end + end end end end @@ -112,13 +126,19 @@ rspamd_config.R_SUSPICIOUS_IMAGES = { if img then for _, i in ipairs(img) do - if i['embedded'] then - local dim = i['width'] + i['height'] - - -- do not trigger on small and large images - if dim > 100 and dim < 3000 then - -- We assume that a single picture 100x200 contains approx 3 words of text - pic_words = pic_words + dim / 100 + local dim = i['width'] + i['height'] + local tag = i['tag'] + + if tag then + local parent = tag:get_parent() + if parent then + if parent:get_type() == 'a' then + -- do not trigger on small and large images + if dim > 100 and dim < 3000 then + -- We assume that a single picture 100x200 contains approx 3 words of text + pic_words = pic_words + dim / 100 + end + end end end end diff --git a/src/libserver/html.c b/src/libserver/html.c index 11dca2c35..07b863b2b 100644 --- a/src/libserver/html.c +++ b/src/libserver/html.c @@ -31,7 +31,7 @@ struct html_tag_def { static struct html_tag_def tag_defs[] = { /* W3C defined elements */ - {Tag_A, "a", (CM_INLINE)}, + {Tag_A, "a", (0)}, {Tag_ABBR, "abbr", (CM_INLINE)}, {Tag_ACRONYM, "acronym", (CM_INLINE)}, {Tag_ADDRESS, "address", (CM_BLOCK)}, @@ -75,7 +75,7 @@ static struct html_tag_def tag_defs[] = { {Tag_HR, "hr", (CM_BLOCK | CM_EMPTY)}, {Tag_HTML, "html", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)}, {Tag_I, "i", (CM_INLINE)}, - {Tag_IFRAME, "iframe", (CM_INLINE)}, + {Tag_IFRAME, "iframe", (0)}, {Tag_IMG, "img", (CM_INLINE | CM_IMG | CM_EMPTY)}, {Tag_INPUT, "input", (CM_INLINE | CM_IMG | CM_EMPTY)}, {Tag_INS, "ins", (CM_INLINE | CM_BLOCK | CM_MIXED)}, @@ -567,6 +567,7 @@ rspamd_html_tag_by_id (gint id) struct html_tag tag; struct html_tag_def *found; + tag.id = id; /* Should work as IDs monotonically increase */ found = bsearch (&tag, tag_defs_num, G_N_ELEMENTS (tag_defs_num), sizeof (tag_defs_num[0]), tag_find_id);