You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

html.lua 12KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433
  1. -- Licensed to the Apache Software Foundation (ASF) under one or more
  2. -- contributor license agreements. See the NOTICE file distributed with
  3. -- this work for additional information regarding copyright ownership.
  4. -- The ASF licenses this file to you under the Apache License, Version 2.0
  5. -- (the "License"); you may not use this file except in compliance with
  6. -- the License. You may obtain a copy of the License at:
  7. --
  8. -- http://www.apache.org/licenses/LICENSE-2.0
  9. --
  10. -- Unless required by applicable law or agreed to in writing, software
  11. -- distributed under the License is distributed on an "AS IS" BASIS,
  12. -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. -- See the License for the specific language governing permissions and
  14. -- limitations under the License.
  -- Table stored under the 'regexp' key of the global `config`
  local reconf = config.regexp

  -- Rule: the message carries an HTML part and nothing else
  reconf.MIME_HTML_ONLY = {
    description = 'Messages that have only HTML part',
    group = 'headers',
    re = 'has_only_html_part()',
    score = 0.2,
  }
  -- Tells whether the image record `img` has a tag whose parent element
  -- reports the type 'a'.
  local function image_is_linked(img)
    local tag = img['tag']
    if not tag then
      return false
    end
    local parent = tag:get_parent()
    if not parent then
      return false
    end
    return parent:get_type() == 'a'
  end

  -- Scans the HTML text parts of `task` whose length lies in [min, max) and
  -- returns true as soon as one of them holds a linked image that is either
  -- big enough (height + width >= 210) or not embedded.
  -- Returns nothing when no such image is found.
  local function check_html_image(task, min, max)
    for _, part in ipairs(task:get_text_parts()) do
      if part:is_html() then
        local html = part:get_html()
        local part_len = part:get_length()
        if html and part_len >= min and part_len < max then
          local found_images = html:get_images()
          if found_images then
            for _, img in ipairs(found_images) do
              -- small embedded images are ignored; anything not embedded counts
              if image_is_linked(img) and
                  (img['height'] + img['width'] >= 210 or not img['embedded']) then
                return true
              end
            end
          end
        end
      end
    end
  end
  -- Linked image inside an HTML part whose length is in [0, 1024)
  rspamd_config.HTML_SHORT_LINK_IMG_1 = {
    description = 'Short html part (0..1K) with a link to an image',
    group = 'html',
    score = 2.0,
    callback = function(task)
      local lower, upper = 0, 1024
      return check_html_image(task, lower, upper)
    end,
  }
  -- Linked image inside an HTML part whose length is in [1024, 1536)
  rspamd_config.HTML_SHORT_LINK_IMG_2 = {
    description = 'Short html part (1K..1.5K) with a link to an image',
    group = 'html',
    score = 1.0,
    callback = function(task)
      local lower, upper = 1024, 1536
      return check_html_image(task, lower, upper)
    end,
  }
  -- Linked image inside an HTML part whose length is in [1536, 2048)
  rspamd_config.HTML_SHORT_LINK_IMG_3 = {
    description = 'Short html part (1.5K..2K) with a link to an image',
    group = 'html',
    score = 0.5,
    callback = function(task)
      local lower, upper = 1536, 2048
      return check_html_image(task, lower, upper)
    end,
  }
  -- Fires for an HTML part with fewer than 50 length units of text that
  -- carries a large image (height + width >= 400) whose tag has a parent
  -- element other than 'a'.
  rspamd_config.R_EMPTY_IMAGE = {
    description = 'Message contains empty parts and image',
    group = 'html',
    score = 2.0,
    callback = function(task)
      -- Large image whose tag has a parent that is not an anchor
      local function is_large_unlinked(img)
        if not (img['height'] + img['width'] >= 400) then
          return false
        end
        local tag = img['tag']
        if not tag then
          return false
        end
        local parent = tag:get_parent()
        if not parent then
          return false
        end
        return parent:get_type() ~= 'a'
      end

      for _, part in ipairs(task:get_text_parts()) do
        if part:is_html() then
          local html = part:get_html()
          local part_len = part:get_length()
          if html and part_len < 50 then
            local found_images = html:get_images()
            if found_images then
              for _, img in ipairs(found_images) do
                if is_large_unlinked(img) then
                  return true
                end
              end
            end
          end
        end
      end
      -- falls through without a return value when nothing matched
    end,
  }
  -- Fires when linked images outweigh the real text of a part.
  --
  -- For every text part that has an HTML context, each image whose tag sits
  -- inside an 'a' element and whose width + height is strictly between 100
  -- and 3000 is converted into an estimated word count (dim / 100). The
  -- callback returns `true` plus a dynamic weight (rel - 0.5) * 2 as soon as
  -- the estimated share of "picture words" in a part exceeds 0.5, and
  -- `false` otherwise.
  rspamd_config.R_SUSPICIOUS_IMAGES = {
    callback = function(task)
      local tp = task:get_text_parts() -- get text parts in a message

      for _, p in ipairs(tp) do
        local h = p:get_html()

        if h then
          local l = p:get_words_count()
          local img = h:get_images()
          local pic_words = 0

          if img then
            for _, i in ipairs(img) do
              local dim = i['width'] + i['height']
              local tag = i['tag']
              local parent = tag and tag:get_parent()

              if parent and parent:get_type() == 'a' then
                -- do not trigger on small and large images
                if dim > 100 and dim < 3000 then
                  -- We assume that a single picture 100x200 contains approx 3 words of text
                  pic_words = pic_words + dim / 100
                end
              end
            end
          end

          -- Guard against 0/0 when the part has neither words nor counted images
          if l + pic_words > 0 then
            local rel = pic_words / (l + pic_words)

            if rel > 0.5 then
              return true, (rel - 0.5) * 2
            end
          end
        end
      end

      return false
    end,
    score = 5.0,
    group = 'html',
    -- The rule counts images, so the description says "images"
    description = 'Message contains many suspicious images'
  }
  -- Callback symbol that inspects 'font', 'span', 'div', 'p' and 'td' tags of
  -- every HTML part and inserts up to three results:
  --   * R_WHITE_ON_WHITE     - text whose colour is close to its background
  --   * MANY_INVISIBLE_PARTS - blocks flagged as not visible (or low contrast)
  --   * ZERO_FONT            - blocks whose font size is 0
  -- The value returned by register_symbol is kept in `vis_check_id`.
  local vis_check_id = rspamd_config:register_symbol{
    name = 'HTML_VISIBLE_CHECKS',
    type = 'callback',
    callback = function(task)
      local tp = task:get_text_parts() -- get text parts in a message
      local ret = false -- becomes true once a low-contrast block is seen
      local transp_rate = 0
      local invisible_blocks = 0
      local zero_size_blocks = 0
      local arg -- option string describing the worst offending tag
      local normal_len = 0
      local transp_len = 0

      for _, p in ipairs(tp) do -- iterate over text parts array using `ipairs`
        normal_len = normal_len + p:get_length()

        if p:is_html() and p:get_html() then -- if the current part is html part
          local hc = p:get_html() -- we get HTML context

          hc:foreach_tag({'font', 'span', 'div', 'p', 'td'}, function(tag)
            local bl = tag:get_extra()

            if bl then
              if not bl['visible'] then
                invisible_blocks = invisible_blocks + 1
              end

              if bl['font_size'] and bl['font_size'] == 0 then
                zero_size_blocks = zero_size_blocks + 1
              end

              if bl['bgcolor'] and bl['color'] and bl['visible'] then
                local color = bl['color']
                local bgcolor = bl['bgcolor']
                -- Should use visual approach here some day
                -- Weighted distance between the two RGB triples
                local diff_r = math.abs(color[1] - bgcolor[1])
                local diff_g = math.abs(color[2] - bgcolor[2])
                local diff_b = math.abs(color[3] - bgcolor[3])
                local r_avg = (color[1] + bgcolor[1]) / 2.0
                -- Square
                diff_r = diff_r * diff_r
                diff_g = diff_g * diff_g
                diff_b = diff_b * diff_b

                local diff = math.sqrt(2 * diff_r + 4 * diff_g + 3 * diff_b +
                    (r_avg * (diff_r - diff_b) / 256.0))
                diff = diff / 256.0

                if diff < 0.1 then
                  ret = true
                  local content_len = #(tag:get_content() or {})
                  invisible_blocks = invisible_blocks + 1 -- This block is invisible
                  -- The closer the colours, the more of the content counts as hidden
                  transp_len = transp_len + content_len * (0.1 - diff) * 10.0
                  normal_len = normal_len - content_len
                  local tr = transp_len / (normal_len + transp_len)

                  if tr > transp_rate then
                    transp_rate = tr
                    -- %02x keeps every channel two hex digits wide, so the
                    -- reported colour is unambiguous (e.g. #000fff, not #0fff)
                    arg = string.format('%s color #%02x%02x%02x bgcolor #%02x%02x%02x',
                        tostring(tag:get_type()),
                        color[1], color[2], color[3],
                        bgcolor[1], bgcolor[2], bgcolor[3])
                  end
                end
              end
            end

            return false -- Continue search
          end)
        end
      end

      if ret then
        transp_rate = transp_len / (normal_len + transp_len)

        -- A NaN rate never passes this comparison, so only the upper bound
        -- needs clamping below
        if transp_rate > 0.1 then
          if transp_rate > 0.5 then
            transp_rate = 0.5
          end

          task:insert_result('R_WHITE_ON_WHITE', (transp_rate * 2.0), arg)
        end
      end

      if invisible_blocks > 0 then
        if invisible_blocks > 10 then
          invisible_blocks = 10
        end

        local rates = { -- From 1 to 10
          0.05,
          0.1,
          0.2,
          0.3,
          0.4,
          0.5,
          0.6,
          0.7,
          0.8,
          1.0,
        }

        task:insert_result('MANY_INVISIBLE_PARTS', rates[invisible_blocks],
            tostring(invisible_blocks))
      end

      if zero_size_blocks > 0 then
        if zero_size_blocks > 10 then
          -- Full score
          task:insert_result('ZERO_FONT', 1.0,
              tostring(zero_size_blocks))
        else
          -- 6..10 blocks are reported with the same weight and option as 5
          if zero_size_blocks > 5 then
            zero_size_blocks = 5
          end

          local rates = { -- From 1 to 5
            0.1,
            0.2,
            0.2,
            0.3,
            0.5,
          }

          task:insert_result('ZERO_FONT', rates[zero_size_blocks],
              tostring(zero_size_blocks))
        end
      end
    end,
  }
  -- Virtual symbol attached to the `vis_check_id` parent callback
  rspamd_config:register_symbol{
    name = 'R_WHITE_ON_WHITE',
    type = 'virtual',
    parent = vis_check_id,
    group = 'html',
    score = 4.0,
    one_shot = true,
    description = 'Message contains low contrast text',
  }
  -- Virtual symbol attached to the `vis_check_id` parent callback.
  -- The parent inserts it with weight 1.0 only when more than 10 zero-sized
  -- blocks were counted; smaller counts get a fraction of the score.
  rspamd_config:register_symbol{
    name = 'ZERO_FONT',
    type = 'virtual',
    parent = vis_check_id,
    group = 'html',
    score = 1.0,
    one_shot = true,
    description = 'Zero sized font used',
  }
  -- Virtual symbol attached to the `vis_check_id` parent callback.
  -- The parent inserts it with weight 1.0 once 10 or more hidden blocks were
  -- counted; smaller counts get a fraction of the score.
  rspamd_config:register_symbol{
    name = 'MANY_INVISIBLE_PARTS',
    type = 'virtual',
    parent = vis_check_id,
    group = 'html',
    score = 1.0,
    one_shot = true,
    description = 'Many parts are visually hidden',
  }
  -- Fires when a 'link' tag of an HTML part has extra data whose string form
  -- ends in ".css", optionally followed by a query string or fragment.
  rspamd_config.EXT_CSS = {
    description = 'Message contains external CSS reference',
    group = 'html',
    score = 1.0,
    callback = function(task)
      local rspamd_regexp = require "rspamd_regexp"
      local css_re = rspamd_regexp.create_cached('/^.*\\.css(?:[?#].*)?$/i')
      local matched = false

      -- Tag visitor: records a match and returns the current match state,
      -- so a truthy return is handed back once a stylesheet link was seen
      local function inspect_link(tag)
        local extra = tag:get_extra()
        if extra then
          local text = tostring(extra)
          if text and css_re:match(text) then
            matched = true
          end
        end
        return matched
      end

      for _, part in ipairs(task:get_text_parts()) do
        if part:is_html() and part:get_html() then
          part:get_html():foreach_tag({'link'}, inspect_link)
        end
      end

      return matched
    end,
  }
  -- Fires when the text of an anchor starts with one of http:/https: while
  -- the anchor's target starts with the other. Only the first HTML part
  -- found in the message is examined.
  rspamd_config.HTTP_TO_HTTPS = {
    description = 'Anchor text contains different scheme to target URL',
    group = 'html',
    score = 2.0,
    callback = function(task)
      local parts = task:get_text_parts()
      if not parts then
        return false
      end

      for _, part in ipairs(parts) do
        if part:is_html() then
          local html = part:get_html()
          if not html then
            return false
          end

          local found = false

          html:foreach_tag('a', function(tag)
            -- Nothing more to do once a mismatch was recorded
            if found then
              return true
            end

            local content = tag:get_content()
            if not content then
              return false
            end
            local text = tostring(content):lower()
            if not text:match('^http') then
              return false
            end

            local extra = tag:get_extra()
            if not extra then
              return false
            end
            local target = tostring(extra):lower()
            if not target:match('^http') then
              return false
            end

            local upgraded = text:match('^http:') and target:match('^https:')
            local downgraded = text:match('^https:') and target:match('^http:')
            if upgraded or downgraded then
              found = true
              return true
            end

            return false
          end)

          -- The verdict of the first HTML part is final
          return found
        end
      end

      return false
    end,
  }
  -- Fires when an anchor in the first HTML part of the message targets an
  -- http(s) URL whose host is a dotted-quad IPv4 address.
  -- The callback returns `true` on a match and `false` otherwise.
  rspamd_config.HTTP_TO_IP = {
    callback = function(task)
      local tp = task:get_text_parts()
      if (not tp) then return false end

      for _,p in ipairs(tp) do
        if p:is_html() then
          local hc = p:get_html()
          if (not hc) then return false end

          local found = false

          hc:foreach_tag('a', function (tag)
            if (found) then return true end

            local u = tag:get_extra()
            if (u) then
              u = tostring(u):lower()
              -- The frontier %f[^%w%.%-] requires the host to end right after
              -- the fourth octet (port, path, query or end of string), so a
              -- hostname such as "1.2.3.4.example.com" is not mistaken for
              -- an IP address
              if (u:match('^https?://%d+%.%d+%.%d+%.%d+%f[^%w%.%-]')) then
                found = true
                return true -- match recorded, no need to visit more tags
              end
            end

            return false
          end)

          if found then return true end
          return false
        end
      end

      -- No HTML part at all: report an explicit negative result
      return false
    end,
    description = 'Anchor points to an IP address',
    score = 1.0,
    group = 'html'
  }