You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

lua_util.extract_specific_urls.lua 9.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345
  1. local msg, msg_img
  2. local logger = require "rspamd_logger"
  3. local rspamd_util = require "rspamd_util"
  4. local rspamd_task = require "rspamd_task"
  5. local util = require 'lua_util'
  6. local mpool = require "rspamd_mempool"
  7. local fun = require "fun"
  8. local url = require "rspamd_url"
  --[=========[ ******************* message ******************* ]=========]
  -- Fixture: a multipart/alternative e-mail whose HTML part carries eleven
  -- <a> links and one <img>. The last link is deliberately deceptive: its
  -- href points at domain.com while the visible text shows example.net.
  --
  -- MIME layout matters here: the folded `boundary=` parameter must start with
  -- whitespace to count as a continuation of Content-Type, and an empty line
  -- must separate each header block from the body that follows it.
  msg = [[
From: <>
To: <nobody@example.com>
Subject: test
Content-Type: multipart/alternative;
    boundary="_000_6be055295eab48a5af7ad4022f33e2d0_"

--_000_6be055295eab48a5af7ad4022f33e2d0_
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: base64

Hello world

--_000_6be055295eab48a5af7ad4022f33e2d0_
Content-Type: text/html; charset="utf-8"

<html><body>
<a href="http://example.net">http://example.net</a>
<a href="http://example1.net">http://example1.net</a>
<a href="http://example2.net">http://example2.net</a>
<a href="http://example3.net">http://example3.net</a>
<a href="http://example4.net">http://example4.net</a>
<a href="http://domain1.com">http://domain1.com</a>
<a href="http://domain2.com">http://domain2.com</a>
<a href="http://domain3.com">http://domain3.com</a>
<a href="http://domain4.com">http://domain4.com</a>
<a href="http://domain5.com">http://domain5.com</a>
<a href="http://domain.com">http://example.net/</a>
<img src="http://example5.org">hahaha</img>
</html>
]]
  -- Fixture: a smaller multipart/alternative e-mail with two <a> links (the
  -- second shows example.net but points at domain.com) and one <img>. Used by
  -- the tests that toggle `need_images`.
  --
  -- As with any MIME message, the folded `boundary=` parameter is indented and
  -- every header block is terminated by an empty line.
  msg_img = [[
From: <>
To: <nobody@example.com>
Subject: test
Content-Type: multipart/alternative;
    boundary="_000_6be055295eab48a5af7ad4022f33e2d0_"

--_000_6be055295eab48a5af7ad4022f33e2d0_
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: base64

Hello world

--_000_6be055295eab48a5af7ad4022f33e2d0_
Content-Type: text/html; charset="utf-8"

<html><body>
<a href="http://example.net">http://example.net</a>
<a href="http://domain.com">http://example.net</a>
<img src="http://example5.org">hahaha</img>
</html>
]]
  --- Turn extracted URL objects into plain strings that are easy to compare.
  -- Each element of `actual` is mapped through `get_raw()` and a leading
  -- `scheme://` prefix (if any) is removed.
  -- @param actual collection of URL objects, iterated via `fun.map`
  -- @return array-like table of raw URL strings without the scheme prefix
  local function prepare_actual_result(actual)
    local function strip_scheme(u)
      -- string.gsub returns (result, match_count); the outer parentheses keep
      -- only the string so the count does not leak into the mapped stream.
      return (u:get_raw():gsub('^%w+://', ''))
    end

    return fun.totable(fun.map(strip_scheme, actual))
  end
  -- Tests util.extract_specific_urls against a stub task object whose URL list
  -- is replaced before every call. Each entry of `cases` is exercised twice:
  -- once through the deprecated positional call and once through the
  -- table-argument call.
  context("Lua util - extract_specific_urls plain", function()
    local test_helper = require "rspamd_test_helper"

    test_helper.init_url_parser()

    -- Stand-in for a task: the cache hooks do nothing and get_urls hands back
    -- whatever is currently stored in `urls` (the need_emails flag is ignored).
    local task_object = {
      urls = {},
      cache_set = function(self, ...) end,
      cache_get = function(self, ...) end,
      get_urls = function(self, need_emails) return self.urls end
    }

    -- Default input used by every case that does not provide its own `input`.
    local url_list = {
      "google.com",
      "mail.com",
      "bizz.com",
      "bing.com",
      "example.com",
      "gov.co.net",
      "tesco.co.net",
      "domain1.co.net",
      "domain2.co.net",
      "domain3.co.net",
      "domain4.co.net",
      "abc.org",
      "icq.org",
      "meet.org",
      "domain1.org",
      "domain2.org",
      "domain3.org",
      "test.com",
    }

    -- Case fields: `input` (optional, falls back to url_list), `expect`,
    -- `filter`, `limit`, `esld_limit` (optional), `need_emails`, `prefix`.
    local cases = {
      {expect = url_list, filter = nil, limit = 9999, need_emails = true, prefix = 'p'},
      {expect = {}, filter = (function() return false end), limit = 9999, need_emails = true, prefix = 'p'},
      {expect = {"domain4.co.net", "test.com", "domain3.org"}, filter = nil, limit = 3, need_emails = true, prefix = 'p'},
      {
        expect = {"gov.co.net", "tesco.co.net", "domain1.co.net", "domain2.co.net", "domain3.co.net", "domain4.co.net"},
        filter = (function(s) return s:get_host():sub(-4) == ".net" end),
        limit = 9999,
        need_emails = true,
        prefix = 'p'
      },
      {
        input = {"a.google.com", "b.google.com", "c.google.com", "a.net", "bb.net", "a.bb.net", "b.bb.net"},
        expect = {"a.bb.net", "b.google.com", "a.net", "bb.net", "a.google.com"},
        filter = nil,
        limit = 9999,
        esld_limit = 2,
        need_emails = true,
        prefix = 'p'
      },
      {
        input = {"abc@a.google.com", "b.google.com", "c.google.com", "a.net", "bb.net", "a.bb.net", "b.bb.net"},
        expect = {"abc@a.google.com", "a.bb.net", "b.google.com", "a.net", "bb.net"},
        filter = nil,
        limit = 9999,
        esld_limit = 2,
        need_emails = true,
        prefix = 'p'
      }
    }

    local pool = mpool.create()

    -- Builds URL objects (allocated from `pool`) for `list`, or for the
    -- default url_list when `list` is nil.
    local function prepare_url_list(list)
      return fun.totable(fun.map(
        function (u) return url.create(pool, u) end,
        list or url_list
      ))
    end

    for i,c in ipairs(cases) do
      test("extract_specific_urls, backward compatibility case #" .. i, function()
        if c.esld_limit then
          -- esld_limit is not available in the deprecated positional call;
          -- bail out before building any URL objects for this case
          return
        end

        task_object.urls = prepare_url_list(c.input)

        local actual = util.extract_specific_urls(task_object, c.limit, c.need_emails, c.filter, c.prefix)
        local actual_result = prepare_actual_result(actual)

        --[[
        local s = logger.slog("%1 =?= %2", c.expect, actual_result)
        print(s) --]]

        assert_rspamd_table_eq_sorted({actual = actual_result, expect = c.expect})
      end)

      test("extract_specific_urls " .. i, function()
        task_object.urls = prepare_url_list(c.input)

        local actual = util.extract_specific_urls({
          task = task_object,
          limit = c.limit,
          esld_limit = c.esld_limit,
          need_emails = c.need_emails,
          filter = c.filter,
          prefix = c.prefix,
        })
        local actual_result = prepare_actual_result(actual)

        --[[
        local s = logger.slog("case[%1] %2 =?= %3", i, c.expect, actual_result)
        print(s) --]]

        assert_rspamd_table_eq_sorted({actual = actual_result, expect = c.expect})
      end)
    end

    test("extract_specific_urls, another case", function()
      task_object.urls = prepare_url_list {"abc.net", "abc.com", "abc.net", "abc.za.org"}

      local expect = {"abc.com", "abc.net", "abc.za.org"}
      local actual = util.extract_specific_urls(task_object, 3, true)
      local actual_result = prepare_actual_result(actual)

      --[[
      local s = logger.slog("%1 =?= %2", expect, actual_result)
      print(s) --]]

      assert_rspamd_table_eq_sorted({actual = actual_result, expect = expect})
    end)
  end)
  -- Tests util.extract_specific_urls against tasks built from the `msg` and
  -- `msg_img` fixtures, varying `limit` and `need_images`.
  context("Lua util - extract_specific_urls message", function()
    --[[ ******************* kinda functional *************************************** ]]
    local test_helper = require "rspamd_test_helper"
    local cfg = rspamd_util.config_from_ucl(test_helper.default_config(),
        "INIT_URL,INIT_LIBS,INIT_SYMCACHE,INIT_VALIDATE,INIT_PRELOAD_MAPS")

    -- Loads `message` into a task using `cfg` and processes it.
    -- Raises an error if either step reports failure; returns the task.
    local function load_task(message)
      local res, loaded = rspamd_task.load_from_string(message, cfg)
      assert(res, "failed to load message")
      assert(loaded:process_message(), "failed to process message")
      return loaded
    end

    -- Registers one test that calls extract_specific_urls with `params`
    -- (plus the given task) and compares the scheme-less result to `expect`.
    local function add_case(name, task, params, expect)
      test(name, function()
        params.task = task
        local actual = util.extract_specific_urls(params)
        local actual_result = prepare_actual_result(actual)
        assert_rspamd_table_eq_sorted({actual = actual_result, expect = expect})
      end)
    end

    -- Each message is bound to its own local. The test closures capture the
    -- variable, not the value, so reusing a single `task` variable for both
    -- messages would let the second load replace the task seen by tests that
    -- were registered for the first message but run later.
    local task = load_task(msg)

    add_case("extract_specific_urls - from email 1 limit", task,
        {limit = 1, esld_limit = 1},
        {"domain.com"})

    add_case("extract_specific_urls - from email 2 limit", task,
        {limit = 2, esld_limit = 1},
        {"domain.com", "example.net"})

    local task_img = load_task(msg_img)

    add_case("extract_specific_urls - from email image 1 limit", task_img,
        {limit = 1, esld_limit = 1, need_images = false},
        {"domain.com"})

    add_case("extract_specific_urls - from email image 2 limit", task_img,
        {limit = 2, esld_limit = 1, need_images = false},
        {"domain.com", "example.net"})

    add_case("extract_specific_urls - from email image 3 limit, no images", task_img,
        {limit = 3, esld_limit = 1, need_images = false},
        {"domain.com", "example.net"})

    add_case("extract_specific_urls - from email image 3 limit, has images", task_img,
        {limit = 3, esld_limit = 1, need_images = true},
        {"domain.com", "example.net", "example5.org"})

    add_case("extract_specific_urls - from email image 2 limit, has images", task_img,
        {limit = 2, esld_limit = 1, need_images = true},
        {"domain.com", "example.net"})
  end)