source.dussan.org Git - rspamd.git/commitdiff
Fix HTML image rules to reduce false positive (FP) rates
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 16 Feb 2016 12:04:51 +0000 (12:04 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 16 Feb 2016 12:04:51 +0000 (12:04 +0000)
Suggested by: @moisseev
Issue: #525

rules/html.lua
src/libserver/html.c

index c2635a8d0698e5e5bd7528f87e47e80c4a476030..0f26d14219ab010817fdb5b38e21ac4c0ced6b65 100644 (file)
@@ -33,8 +33,14 @@ local function check_html_image(task, min, max)
         local images = hc:get_images()
         if images then
           for _,i in ipairs(images) do
-            if i['embedded'] then
-              return true
+            local tag = i['tag']
+            if tag then
+              local parent = tag:get_parent()
+              if parent then
+                if parent:get_type() == 'a' then
+                  return true
+                end
+              end
             end
           end
         end
@@ -83,8 +89,16 @@ rspamd_config.R_EMPTY_IMAGE = {
 
           if images then -- if there are images
             for _,i in ipairs(images) do -- then iterate over images in the part
-              if i['embedded'] and i['height'] + i['width'] >= 400 then -- if we have a large image
-                return true -- add symbol
+              if i['height'] + i['width'] >= 400 then -- if we have a large image
+                local tag = i['tag']
+                if tag then
+                  local parent = tag:get_parent()
+                  if parent then
+                    if parent:get_type() == 'a' then
+                      return true
+                    end
+                  end
+                end
               end
             end
           end
@@ -112,13 +126,19 @@ rspamd_config.R_SUSPICIOUS_IMAGES = {
 
         if img then
           for _, i in ipairs(img) do
-            if i['embedded'] then
-              local dim = i['width'] + i['height']
-
-              -- do not trigger on small and large images
-              if dim > 100 and dim < 3000 then
-                -- We assume that a single picture 100x200 contains approx 3 words of text
-                pic_words = pic_words + dim / 100
+            local dim = i['width'] + i['height']
+            local tag = i['tag']
+
+            if tag then
+              local parent = tag:get_parent()
+              if parent then
+                if parent:get_type() == 'a' then
+                  -- do not trigger on small and large images
+                  if dim > 100 and dim < 3000 then
+                    -- We assume that a single picture 100x200 contains approx 3 words of text
+                    pic_words = pic_words + dim / 100
+                  end
+                end
               end
             end
           end
index 11dca2c355948036e0d282c7cd967a989b9482b0..07b863b2bd782200fc589954d0390de140661eb0 100644 (file)
@@ -31,7 +31,7 @@ struct html_tag_def {
 
 static struct html_tag_def tag_defs[] = {
        /* W3C defined elements */
-       {Tag_A, "a", (CM_INLINE)},
+       {Tag_A, "a", (0)},
        {Tag_ABBR, "abbr", (CM_INLINE)},
        {Tag_ACRONYM, "acronym", (CM_INLINE)},
        {Tag_ADDRESS, "address", (CM_BLOCK)},
@@ -75,7 +75,7 @@ static struct html_tag_def tag_defs[] = {
        {Tag_HR, "hr", (CM_BLOCK | CM_EMPTY)},
        {Tag_HTML, "html", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)},
        {Tag_I, "i", (CM_INLINE)},
-       {Tag_IFRAME, "iframe", (CM_INLINE)},
+       {Tag_IFRAME, "iframe", (0)},
        {Tag_IMG, "img", (CM_INLINE | CM_IMG | CM_EMPTY)},
        {Tag_INPUT, "input", (CM_INLINE | CM_IMG | CM_EMPTY)},
        {Tag_INS, "ins", (CM_INLINE | CM_BLOCK | CM_MIXED)},
@@ -567,6 +567,7 @@ rspamd_html_tag_by_id (gint id)
        struct html_tag tag;
        struct html_tag_def *found;
 
+       tag.id = id;
        /* Should work as IDs monotonically increase */
        found = bsearch (&tag, tag_defs_num, G_N_ELEMENTS (tag_defs_num),
                                sizeof (tag_defs_num[0]), tag_find_id);