]> source.dussan.org Git - rspamd.git/commitdiff
Start work on new HTML rules.
authorVsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 23 Jul 2015 15:11:49 +0000 (16:11 +0100)
committerVsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 23 Jul 2015 15:11:49 +0000 (16:11 +0100)
conf/lua/html.lua [new file with mode: 0644]
conf/lua/regexp/headers.lua
conf/lua/rspamd.lua
conf/metrics.conf

diff --git a/conf/lua/html.lua b/conf/lua/html.lua
new file mode 100644 (file)
index 0000000..bd7abd9
--- /dev/null
@@ -0,0 +1,53 @@
+-- Licensed to the Apache Software Foundation (ASF) under one or more
+-- contributor license agreements.  See the NOTICE file distributed with
+-- this work for additional information regarding copyright ownership.
+-- The ASF licenses this file to you under the Apache License, Version 2.0
+-- (the "License"); you may not use this file except in compliance with
+-- the License.  You may obtain a copy of the License at:
+-- 
+--     http://www.apache.org/licenses/LICENSE-2.0
+-- 
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+
+local reconf = config['regexp']
+local rspamd_regexp = require "rspamd_regexp"
+
+-- Regexp rule: messages that have only an HTML part
+reconf['MIME_HTML_ONLY'] = 'has_only_html_part()'
+
+-- Returns true if any HTML text part of `task` whose raw length lies in
+-- [min, max) contains an image flagged `embedded`; returns nil otherwise.
+local function check_html_image(task, min, max)
+  for _,p in ipairs(task:get_text_parts() or {}) do -- no parts: skip the loop
+    if p:is_html() then
+      local hc = p:get_html()
+      local len = p:get_raw_length()
+      -- Defensive: skip the part when the HTML object or length is missing
+      if hc and len and len >= min and len < max then
+        local images = hc:get_images()
+        if images then
+          for _,i in ipairs(images) do
+            -- NOTE(review): symbol text says "link to an image"; confirm 'embedded'
+            if i['embedded'] then
+              return true
+            end
+          end
+        end
+      end
+    end
+  end
+end
+
+-- Symbols bucketed by HTML part raw length: {symbol name, min, max}
+for _,r in ipairs({
+  {'HTML_SHORT_LINK_IMG_1', 0, 1024},
+  {'HTML_SHORT_LINK_IMG_2', 1024, 1536},
+  {'HTML_SHORT_LINK_IMG_3', 1536, 2048},
+}) do
+  -- Each callback returns check_html_image's result for the range [min, max)
+  rspamd_config[r[1]] = function(task) return check_html_image(task, r[2], r[3]) end
+end
\ No newline at end of file
index e6f079e860629b6b26f22d862fd42116aa72de4b..e8bc7af448f632618dc222201c77958fe9020e10 100644 (file)
@@ -70,10 +70,6 @@ reconf['R_MISSING_CHARSET']= string.format('content_type_is_type(text) & !conten
 -- Subject seems to be spam
 reconf['R_SAJDING'] = 'Subject=/\\bsajding(?:om|a)?\\b/iH'
 
--- Messages that have only HTML part
-reconf['MIME_HTML_ONLY'] = 'has_only_html_part()'
-
-
 -- Find forged Outlook MUA 
 -- Yahoo groups messages
 local yahoo_bulk = 'Received=/from \\[\\S+\\] by \\S+\\.(?:groups|scd|dcn)\\.yahoo\\.com with NNFMP/H'
index df480a72cea34f4b0c9d2c754f3522c68f90d1cd..bb1709c6922e33775f5608079c882324ba07627f 100644 (file)
@@ -32,16 +32,13 @@ dofile('regexp/headers.lua')
 dofile('regexp/lotto.lua')
 dofile('regexp/fraud.lua')
 dofile('regexp/drugs.lua')
+dofile('html.lua')
 
 local reconf = config['regexp']
 local util = require "rspamd_util"
 
 -- Uncategorized rules
 
-local html_length_1024_1536 = 'has_content_part_len(\'text\', \'html\', 1024, 1536)'
-local html_link_image = '/<img /iPr'
-reconf['HTML_SHORT_LINK_IMG_2'] = string.format('(%s) & (%s)', html_length_1024_1536, html_link_image)
-
 -- Local rules
 local r_bgcolor = '/BGCOLOR=/iP'
 local r_font_color = '/font color=[\\"\']?\\#FFFFFF[\\"\']?/iP'
index 95826765ed91b6ff00b5eb610be265dcdfe9705a..b3fc30eca6ac34d926e9525f5eb59d2ada00785a 100644 (file)
@@ -430,8 +430,18 @@ metric {
        symbol {
            weight = 3.0;
            description = "Short html part with a link to an image";
+           name = "HTML_SHORT_LINK_IMG_1";
+       }
+       symbol {
+           weight = 1.0;
+           description = "Short html part with a link to an image";
            name = "HTML_SHORT_LINK_IMG_2";
        }
+       symbol {
+           weight = 0.5;
+           description = "Short html part with a link to an image";
+           name = "HTML_SHORT_LINK_IMG_3";
+       }
        symbol {
            weight = 5.0;
            description = "Suspicious boundary in header Content-Type";