Merge pull request #2385 from negram/extract_specific_urls-rework

Extract specific urls rework
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-07-31 19:26:08 +0100
committer: GitHub <noreply@github.com> 2018-07-31 19:26:08 +0100
commit: 111fb4d10cc15830e52611b663b2dbf5d8571b21 (patch)
tree: 1fac9cd11bfd288fd1463229a67c6234b7d74111
parent: f17932a8e0c6f1759651e00524678f43c2cd0293 (diff)
parent: 1003d2725ca467b3b080c2bedd3dd06068673573 (diff)
download: rspamd-111fb4d10cc15830e52611b663b2dbf5d8571b21.tar.gz
rspamd-111fb4d10cc15830e52611b663b2dbf5d8571b21.zip
4 files changed, 407 insertions, 32 deletions
diff --git a/lualib/lua_util.lua b/lualib/lua_util.lua
index 520318fe6..ba5843ff6 100644
--- a/lualib/lua_util.lua
+++ b/lualib/lua_util.lua
@@ -490,36 +490,71 @@ end
 exports.override_defaults = override_defaults
 
 --[[[
--- @function lua_util.extract_specific_urls(task, limit, [need_emails[, filter[, prefix])
+-- @function lua_util.extract_specific_urls(params)
+-- params: {
+- - task
+- - limit <int> (default = 9999)
+- - esld_limit <int> (default = 9999) n domains per eSLD (effective second level domain)
+                                      works only if the number of unique eSLDs is less than `limit`
+- - need_emails <bool> (default = false)
+- - filter <callback> (default = nil)
+- - prefix <string> cache prefix (default = nil)
+-- }
 -- Apply heuristic in extracting of urls from task, this function
 -- tries its best to extract specific number of urls from a task based on
 -- their characteristics
 --]]
-exports.extract_specific_urls = function(task, lim, need_emails, filter, prefix)
+-- exports.extract_specific_urls = function(params_or_task, limit, need_emails, filter, prefix)
+exports.extract_specific_urls = function(params_or_task, lim, need_emails, filter, prefix)
+  local default_params = {
+    limit = 9999,
+    esld_limit = 9999,
+    need_emails = false,
+    filter = nil,
+    prefix = nil
+  }
+
+  local params
+  if type(params_or_task) == 'table' and type(lim) == 'nil' then
+    params = params_or_task
+  else
+    -- Deprecated call
+    params = {
+      task = params_or_task,
+      limit = lim,
+      need_emails = need_emails,
+      filter = filter,
+      prefix = prefix
+    }
+  end
+  for k,v in pairs(default_params) do
+    if not params[k] then params[k] = v end
+  end
+
+
   local cache_key
 
-  if prefix then
-    cache_key = prefix
+  if params.prefix then
+    cache_key = params.prefix
   else
-    cache_key = string.format('sp_urls_%d%s', lim, need_emails)
+    cache_key = string.format('sp_urls_%d%s', params.limit, params.need_emails)
   end
 
 
-  local cached = task:cache_get(cache_key)
+  local cached = params.task:cache_get(cache_key)
 
   if cached then
     return cached
   end
 
-  local urls = task:get_urls(need_emails)
+  local urls = params.task:get_urls(params.need_emails)
 
   if not urls then return {} end
 
-  if filter then urls = fun.totable(fun.filter(filter, urls)) end
-
-  if #urls <= lim then
-    task:cache_set(cache_key, urls)
+  if params.filter then urls = fun.totable(fun.filter(params.filter, urls)) end
 
+  if #urls <= params.limit and #urls <= params.esld_limit then
+    params.task:cache_set(cache_key, urls)
     return urls
   end
 
@@ -538,7 +573,9 @@ exports.extract_specific_urls = function(task, lim, need_emails, filter, prefix)
         eslds[esld] = {u}
         neslds = neslds + 1
       else
-        table.insert(eslds[esld], u)
+        if #eslds[esld] < params.esld_limit then
+          table.insert(eslds[esld], u)
+        end
       end
 
       local parts = rspamd_str_split(esld, '.')
@@ -558,7 +595,7 @@ exports.extract_specific_urls = function(task, lim, need_emails, filter, prefix)
         else
           if u:get_user() then
             table.insert(res, u)
-          elseif u:is_subject() then
+          elseif u:is_subject() or u:is_phished() then
             table.insert(res, u)
           end
         end
@@ -566,35 +603,40 @@ exports.extract_specific_urls = function(task, lim, need_emails, filter, prefix)
     end
   end
 
-  lim = lim - #res
-  if lim <= 0 then lim = 1 end
+  local limit = params.limit
+  limit = limit - #res
+  if limit <= 0 then limit = 1 end
 
-  if neslds <= lim then
+  if neslds <= limit then
     -- We can get urls based on their eslds
-    while lim > 0 do
+    repeat
+      local item_found = false
+
       for _,lurls in pairs(eslds) do
         if #lurls > 0 then
           table.insert(res, table.remove(lurls))
-          lim = lim - 1
+          limit = limit - 1
+          item_found = true
         end
       end
-    end
 
-    task:cache_set(cache_key, urls)
+    until limit <= 0 or not item_found
+
+    params.task:cache_set(cache_key, urls)
     return res
   end
 
-  if ntlds <= lim then
-    while lim > 0 do
+  if ntlds <= limit then
+    while limit > 0 do
       for _,lurls in pairs(tlds) do
         if #lurls > 0 then
           table.insert(res, table.remove(lurls))
-          lim = lim - 1
+          limit = limit - 1
         end
       end
     end
 
-    task:cache_set(cache_key, urls)
+    params.task:cache_set(cache_key, urls)
     return res
   end
 
@@ -611,14 +653,14 @@ exports.extract_specific_urls = function(task, lim, need_emails, filter, prefix)
     local tld2 = tlds[tlds_keys[ntlds - i]]
     table.insert(res, table.remove(tld1))
     table.insert(res, table.remove(tld2))
-    lim = lim - 2
+    limit = limit - 2
 
-    if lim <= 0 then
+    if limit <= 0 then
       break
     end
   end
 
-  task:cache_set(cache_key, urls)
+  params.task:cache_set(cache_key, urls)
   return res
 end
 
diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c
index 944ab9dbd..e11a982cd 100644
--- a/src/lua/lua_task.c
+++ b/src/lua/lua_task.c
@@ -53,6 +53,12 @@ end
  * @return {boolean},{rspamd_task|error} status + new task or error message
  */
 LUA_FUNCTION_DEF (task, load_from_file);
+/***
+ * @function rspamd_task.load_from_string(message[, cfg])
+ * Loads a message from a string
+ * @return {boolean},{rspamd_task|error} status + new task or error message
+ */
+LUA_FUNCTION_DEF (task, load_from_string);
 
 LUA_FUNCTION_DEF (task, get_message);
 /***
@@ -908,6 +914,7 @@ LUA_FUNCTION_DEF (task, get_stat_tokens);
 
 static const struct luaL_reg tasklib_f[] = {
 	LUA_INTERFACE_DEF (task, load_from_file),
+	LUA_INTERFACE_DEF (task, load_from_string),
 	{NULL, NULL}
 };
 
@@ -1237,7 +1244,7 @@ lua_task_unmap_dtor (gpointer p)
 	}
 }
 
-static int
+static gint
 lua_task_load_from_file (lua_State * L)
 {
 	struct rspamd_task *task = NULL, **ptask;
@@ -1295,6 +1302,56 @@ lua_task_load_from_file (lua_State * L)
 	return 2;
 }
 
+static gint
+lua_task_load_from_string (lua_State * L)
+{
+	struct rspamd_task *task, **ptask;
+	struct rspamd_config *cfg = NULL;
+	const gchar *str_message;
+	gchar *msg_copy;
+	gsize message_len = 0;
+	gpointer p;
+
+	/*
+	 * Lua: rspamd_task.load_from_string(message[, cfg]) -> true, task.
+	 * A non-string `message` raises a Lua error inside luaL_checklstring,
+	 * so this function has no `false, err` return path.
+	 */
+	str_message = luaL_checklstring (L, 1, &message_len);
+
+	/* Optional second argument: an rspamd{config} userdata */
+	if (lua_type (L, 2) == LUA_TUSERDATA) {
+		p = rspamd_lua_check_udata_maybe (L, 2, "rspamd{config}");
+
+		if (p) {
+			cfg = *(struct rspamd_config **)p;
+		}
+	}
+
+	task = rspamd_task_new (NULL, cfg, NULL, NULL);
+
+	/*
+	 * Lua only guarantees the string buffer while the value is referenced,
+	 * but the task outlives this call: keep a private copy owned by the
+	 * task pool. memcpy preserves embedded NULs; the copy is heap memory,
+	 * so it is released with g_free, not with an unmap-style destructor.
+	 */
+	msg_copy = g_malloc (message_len + 1);
+	memcpy (msg_copy, str_message, message_len);
+	msg_copy[message_len] = '\0';
+	task->msg.begin = msg_copy;
+	task->msg.len = message_len;
+	rspamd_mempool_add_destructor (task->task_pool,
+			(rspamd_mempool_destruct_t)g_free, msg_copy);
+
+	lua_pushboolean (L, TRUE);
+	ptask = lua_newuserdata (L, sizeof (*ptask));
+	*ptask = task;
+	rspamd_lua_setclass (L, "rspamd{task}", -1);
+
+	return 2;
+}
+
 static int
 lua_task_get_mempool (lua_State * L)
 {
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index 063a7aab7..af1c13ad2 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -45,7 +45,7 @@ LUA_FUNCTION_DEF (util, create_event_base);
  */
 LUA_FUNCTION_DEF (util, load_rspamd_config);
 /***
- * @function util.config_from_ucl(any)
+ * @function util.config_from_ucl(any[, string])
  * Load rspamd config from ucl reperesented by any lua table
  * @return {confg} new configuration object suitable for access
  */
@@ -666,17 +666,67 @@ lua_util_load_rspamd_config (lua_State *L)
 }
 
 static gint
+parse_config_options (const char *str_options)
+{
+	gint ret = 0;
+	gchar **vec, **cur;
+	gchar *str;
+
+	/* Turn a list of INIT_* names (',' or ';' separated, any case) into flags */
+	vec = g_strsplit_set (str_options, ",;", -1);
+	for (cur = vec; cur != NULL && *cur != NULL; cur ++) {
+		/* Accept "A, B" and trailing separators: trim, skip empty tokens */
+		str = g_strstrip (*cur);
+
+		if (*str == '\0') {
+			continue;
+		} else if (g_ascii_strcasecmp (str, "INIT_URL") == 0) {
+			ret |= RSPAMD_CONFIG_INIT_URL;
+		} else if (g_ascii_strcasecmp (str, "INIT_LIBS") == 0) {
+			ret |= RSPAMD_CONFIG_INIT_LIBS;
+		} else if (g_ascii_strcasecmp (str, "INIT_SYMCACHE") == 0) {
+			ret |= RSPAMD_CONFIG_INIT_SYMCACHE;
+		} else if (g_ascii_strcasecmp (str, "INIT_VALIDATE") == 0) {
+			ret |= RSPAMD_CONFIG_INIT_VALIDATE;
+		} else if (g_ascii_strcasecmp (str, "INIT_NO_TLD") == 0) {
+			ret |= RSPAMD_CONFIG_INIT_NO_TLD;
+		} else if (g_ascii_strcasecmp (str, "INIT_PRELOAD_MAPS") == 0) {
+			ret |= RSPAMD_CONFIG_INIT_PRELOAD_MAPS;
+		} else {
+			msg_warn ("bad type: %s", str);
+		}
+	}
+
+	g_strfreev (vec);
+
+	return ret;
+}
+
+static gint
 lua_util_config_from_ucl (lua_State *L)
 {
-	struct rspamd_config *cfg, **pcfg;
+	struct rspamd_config *cfg = NULL, **pcfg;
 	struct rspamd_rcl_section *top;
 	GError *err = NULL;
 	ucl_object_t *obj;
+	const char *str_options = NULL;
+	gint int_options = 0;
+
 
 	obj = ucl_object_lua_import (L, 1);
+	if (lua_gettop (L) == 2) {
+		if (lua_type (L, 2) == LUA_TSTRING) {
+			str_options = lua_tostring (L, 2);
+			int_options = parse_config_options(str_options);
+		}
+		else {
+			msg_err_config ("config_from_ucl: second parameter is expected to be string");
+			ucl_object_unref (obj);
+			lua_pushnil (L);
+		}
+	}
 
 	if (obj) {
-		cfg = g_malloc0 (sizeof (struct rspamd_config));
 		cfg = rspamd_config_new (RSPAMD_CONFIG_INIT_SKIP_LUA);
 		cfg->lua_state = L;
 
@@ -690,7 +740,7 @@ lua_util_config_from_ucl (lua_State *L)
 			lua_pushnil (L);
 		}
 		else {
-			rspamd_config_post_load (cfg, 0);
+			rspamd_config_post_load (cfg, int_options);
 			pcfg = lua_newuserdata (L, sizeof (struct rspamd_config *));
 			rspamd_lua_setclass (L, "rspamd{config}", -1);
 			*pcfg = cfg;
diff --git a/test/lua/unit/lua_util.extract_specific_urls.lua b/test/lua/unit/lua_util.extract_specific_urls.lua
new file mode 100644
index 000000000..9c8e4e187
--- /dev/null
+++ b/test/lua/unit/lua_util.extract_specific_urls.lua
@@ -0,0 +1,226 @@
+context("Lua util - extract_specific_urls", function()
+  local util  = require 'lua_util'
+  local mpool = require "rspamd_mempool"
+  local fun   = require "fun"
+  local url   = require "rspamd_url"
+  local logger = require "rspamd_logger"
+  local ffi = require "ffi"
+  local rspamd_util = require "rspamd_util"
+  local rspamd_task = require "rspamd_task"
+
+  ffi.cdef[[
+  void rspamd_url_init (const char *tld_file);
+  unsigned ottery_rand_range(unsigned top);
+  void rspamd_http_normalize_path_inplace(char *path, size_t len, size_t *nlen);
+  ]]
+
+  local test_dir = string.gsub(debug.getinfo(1).source, "^@(.+/)[^/]+$", "%1")
+
+  ffi.C.rspamd_url_init(string.format('%s/%s', test_dir, "test_tld.dat"))
+
+  local task_object = {
+    urls      = {},
+    cache_set = function(self, ...) end,
+    cache_get = function(self, ...) end,
+    get_urls  = function(self, need_emails) return self.urls end
+  }
+
+  local url_list = {
+    "google.com",
+    "mail.com",
+    "bizz.com",
+    "bing.com",
+    "example.com",
+    "gov.co.net",
+    "tesco.co.net",
+    "domain1.co.net",
+    "domain2.co.net",
+    "domain3.co.net",
+    "domain4.co.net",
+    "abc.org",
+    "icq.org",
+    "meet.org",
+    "domain1.org",
+    "domain2.org",
+    "domain3.org",
+    "domain3.org",
+    "test.com",
+  }
+
+  local cases = {
+    {expect = url_list, filter = nil, limit = 9999, need_emails = true, prefix = 'p'},
+    {expect = {}, filter = (function() return false end), limit = 9999, need_emails = true, prefix = 'p'},
+    {expect = {"domain4.co.net", "test.com"}, filter = nil, limit = 2, need_emails = true, prefix = 'p'},
+    {
+      expect = {"gov.co.net", "tesco.co.net", "domain1.co.net", "domain2.co.net", "domain3.co.net", "domain4.co.net"},
+      filter = (function(s) return s:get_host():sub(-4) == ".net" end),
+      limit = 9999,
+      need_emails = true,
+      prefix = 'p'
+    },
+    {
+      input  = {"a.google.com", "b.google.com", "c.google.com", "a.net", "bb.net", "a.bb.net", "b.bb.net"},
+      expect = {"a.bb.net", "b.google.com", "a.net", "bb.net", "a.google.com"},
+      filter = nil,
+      limit = 9999,
+      esld_limit = 2,
+      need_emails = true,
+      prefix = 'p'
+    },
+    {
+      input  = {"abc@a.google.com", "b.google.com", "c.google.com", "a.net", "bb.net", "a.bb.net", "b.bb.net"},
+      expect = {"abc@a.google.com", "a.bb.net", "b.google.com", "a.net", "bb.net", "abc@a.google.com"},
+      filter = nil,
+      limit = 9999,
+      esld_limit = 2,
+      need_emails = true,
+      prefix = 'p'
+    }
+  }
+
+  local function prepare_actual_result(actual) -- actual: items that provide get_raw()
+    return fun.totable(fun.map(
+      function(u) return u:get_raw():gsub('^%w+://', '') end, -- raw text minus a leading "scheme://"
+      actual
+    ))
+  end
+
+  local pool = mpool.create()
+
+  for i,c in ipairs(cases) do
+
+    local function prepare_url_list(c) -- c: one entry of `cases`
+      return fun.totable(fun.map(
+    function (u) return url.create(pool, u) end, -- build a url object in the shared pool
+    c.input or url_list -- cases without their own `input` use the default url_list
+    ))
+  end
+
+    test("extract_specific_urls, backward compatibility case #" .. i, function()
+      task_object.urls = prepare_url_list(c)
+      if (c.esld_limit) then
+        -- not available in deprecated version
+        return
+      end
+      local actual = util.extract_specific_urls(task_object, c.limit, c.need_emails, c.filter, c.prefix)
+
+      local actual_result = prepare_actual_result(actual)
+
+      --[[
+        local s = logger.slog("%1 =?= %2", c.expect, actual_result)
+        print(s) --]]
+
+      assert_equal(true, util.table_cmp(c.expect, actual_result), "checking that we got the same tables")
+
+    end)
+
+    test("extract_specific_urls " .. i, function()
+      task_object.urls = prepare_url_list(c)
+
+      local actual = util.extract_specific_urls({
+        task = task_object,
+        limit = c.limit,
+        esld_limit = c.esld_limit,
+        need_emails = c.need_emails,
+        filter = c.filter,
+        prefix = c.prefix,
+      })
+
+      local actual_result = prepare_actual_result(actual)
+
+      --[[
+        local s = logger.slog("case[%1] %2 =?= %3", i, c.expect, actual_result)
+        print(s) --]]
+
+      assert_equal(true, util.table_cmp(c.expect, actual_result), "checking that we got the same tables")
+
+    end)
+  end
+
+--[[ ******************* kinda functional *************************************** ]]
+  local test_dir = string.gsub(debug.getinfo(1).source, "^@(.+/)[^/]+$", "%1")
+  local tld_file = string.format('%s/%s', test_dir, "test_tld.dat")
+
+  local config = {
+    options = {
+      filters = {'spf', 'dkim', 'regexp'},
+      url_tld = tld_file,
+      dns = {
+        nameserver = {'8.8.8.8'}
+      },
+    },
+    logging = {
+      type = 'console',
+      level = 'debug'
+    },
+    metric = {
+      name = 'default',
+      actions = {
+        reject = 100500,
+      },
+      unknown_weight = 1
+    }
+  }
+
+  test("extract_specific_urls - from email", function()
+    local cfg = rspamd_util.config_from_ucl(config, "INIT_URL,INIT_LIBS,INIT_SYMCACHE,INIT_VALIDATE,INIT_PRELOAD_MAPS")
+    assert_not_nil(cfg)
+
+    local msg = [[
+From: <>
+To: <nobody@example.com>
+Subject: test
+Content-Type: multipart/alternative;
+    boundary="_000_6be055295eab48a5af7ad4022f33e2d0_"
+
+--_000_6be055295eab48a5af7ad4022f33e2d0_
+Content-Type: text/plain; charset="utf-8"
+Content-Transfer-Encoding: base64
+
+Hello world
+
+
+--_000_6be055295eab48a5af7ad4022f33e2d0_
+Content-Type: text/html; charset="utf-8"
+
+<html><body>
+<a href="http://example.net">http://example.net</a>
+<a href="http://example1.net">http://example1.net</a>
+<a href="http://example2.net">http://example2.net</a>
+<a href="http://example3.net">http://example3.net</a>
+<a href="http://example4.net">http://example4.net</a>
+<a href="http://domain1.com">http://domain1.com</a>
+<a href="http://domain2.com">http://domain2.com</a>
+<a href="http://domain3.com">http://domain3.com</a>
+<a href="http://domain4.com">http://domain4.com</a>
+<a href="http://domain5.com">http://domain5.com</a>
+<a href="http://domain.com">http://example.net/</a>
+</html>
+]]
+    local expect = {"example.net", "domain.com"}
+    local res,task = rspamd_task.load_from_string(msg, rspamd_config)
+
+    if not res then
+      assert_true(false, "failed to load message")
+    end
+
+    if not task:process_message() then
+      assert_true(false, "failed to process message")
+    end
+
+    local actual = util.extract_specific_urls({
+      task = task,
+      limit = 2,
+      esld_limit = 2,
+    })
+
+    local actual_result = prepare_actual_result(actual)
+
+    --[[
+      local s = logger.slog("case[%1] %2 =?= %3", i, expect, actual_result)
+      print(s) --]]
+
+    assert_equal("domain.com", actual_result[1], "checking that first url is the one with highest suspiciousness level")
+
+  end)
+end)
+\ No newline at end of file
author	Vsevolod Stakhov <vsevolod@highsecure.ru>	2018-07-31 19:26:08 +0100
committer	GitHub <noreply@github.com>	2018-07-31 19:26:08 +0100
commit	111fb4d10cc15830e52611b663b2dbf5d8571b21 (patch)
tree	1fac9cd11bfd288fd1463229a67c6234b7d74111
parent	f17932a8e0c6f1759651e00524678f43c2cd0293 (diff)
parent	1003d2725ca467b3b080c2bedd3dd06068673573 (diff)
download	rspamd-111fb4d10cc15830e52611b663b2dbf5d8571b21.tar.gz rspamd-111fb4d10cc15830e52611b663b2dbf5d8571b21.zip