You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

html.lua 12KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438
  1. -- Licensed to the Apache Software Foundation (ASF) under one or more
  2. -- contributor license agreements. See the NOTICE file distributed with
  3. -- this work for additional information regarding copyright ownership.
  4. -- The ASF licenses this file to you under the Apache License, Version 2.0
  5. -- (the "License"); you may not use this file except in compliance with
  6. -- the License. You may obtain a copy of the License at:
  7. --
  8. -- http://www.apache.org/licenses/LICENSE-2.0
  9. --
  10. -- Unless required by applicable law or agreed to in writing, software
  11. -- distributed under the License is distributed on an "AS IS" BASIS,
  12. -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. -- See the License for the specific language governing permissions and
  14. -- limitations under the License.
  -- Shortcut to the 'regexp' section of the global `config` table
  local reconf = config['regexp']

  -- Rule for messages whose only part is HTML (see `description` below);
  -- the check itself is delegated to the `has_only_html_part()` expression.
  reconf.MIME_HTML_ONLY = {
    description = 'Messages that have only HTML part',
    group = 'headers',
    score = 0.2,
    re = 'has_only_html_part()',
  }
  --- Tell whether any ancestor of an HTML tag is an anchor (`a`) element.
  -- The tag itself is not inspected, only the chain returned by `get_parent()`.
  -- @param tag object exposing `get_parent()`; ancestors must expose `get_type()`
  -- @return true if some ancestor has type 'a', false otherwise
  local function has_anchor_parent(tag)
    local ancestor = tag:get_parent()
    while ancestor do
      if ancestor:get_type() == 'a' then
        return true
      end
      ancestor = ancestor:get_parent()
    end
    return false
  end
  --- Look for a linked image inside an HTML part of a given size.
  -- A part qualifies when it is HTML, has an HTML context and its length lies
  -- in the half-open range [min, max). An image qualifies when it has a tag
  -- with an anchor ancestor and is either large enough (height + width >= 210)
  -- or not embedded.
  -- @param task object exposing `get_text_parts()`
  -- @param min inclusive lower bound for the part length
  -- @param max exclusive upper bound for the part length
  -- @return true on the first qualifying image; nothing (nil) otherwise
  local function check_html_image(task, min, max)
    for _, part in ipairs(task:get_text_parts()) do
      if part:is_html() then
        local html, size = part:get_html(), part:get_length()
        local pictures = nil
        if html and size >= min and size < max then
          pictures = html:get_images()
        end

        for _, pic in ipairs(pictures or {}) do
          local tag = pic['tag']
          -- do not trigger on small and unknown size images
          if tag and has_anchor_parent(tag)
              and (pic['height'] + pic['width'] >= 210 or not pic['embedded']) then
            return true
          end
        end
      end
    end
  end
  -- Linked image in an HTML part whose length is in [0, 1024)
  rspamd_config.HTML_SHORT_LINK_IMG_1 = {
    description = 'Short html part (0..1K) with a link to an image',
    group = 'html',
    score = 2.0,
    callback = function(task)
      local lower, upper = 0, 1024
      return check_html_image(task, lower, upper)
    end,
  }
  -- Linked image in an HTML part whose length is in [1024, 1536)
  rspamd_config.HTML_SHORT_LINK_IMG_2 = {
    description = 'Short html part (1K..1.5K) with a link to an image',
    group = 'html',
    score = 1.0,
    callback = function(task)
      local lower, upper = 1024, 1536
      return check_html_image(task, lower, upper)
    end,
  }
  -- Linked image in an HTML part whose length is in [1536, 2048)
  rspamd_config.HTML_SHORT_LINK_IMG_3 = {
    description = 'Short html part (1.5K..2K) with a link to an image',
    group = 'html',
    score = 0.5,
    callback = function(task)
      local lower, upper = 1536, 2048
      return check_html_image(task, lower, upper)
    end,
  }
  -- Large image that is NOT wrapped in a link, found in an HTML part that has
  -- fewer than 50 bytes of text.
  rspamd_config.R_EMPTY_IMAGE = {
    description = 'Message contains empty parts and image',
    group = 'html',
    score = 2.0,
    callback = function(task)
      for _, part in ipairs(task:get_text_parts()) do
        if part:is_html() then
          local html, text_len = part:get_html(), part:get_length()

          -- Images are only examined for (almost) empty HTML parts
          local pictures = nil
          if html and text_len < 50 then
            pictures = html:get_images()
          end

          for _, pic in ipairs(pictures or {}) do
            -- "large" means height + width of at least 400
            if pic['height'] + pic['width'] >= 400 then
              local tag = pic['tag']
              if tag and not has_anchor_parent(tag) then
                return true
              end
            end
          end
        end
      end
      -- No explicit result when nothing matched (callback yields nil)
    end,
  }
  -- Flags a part when linked images are estimated to carry more "words" than
  -- the part's real text. Each linked image with 100 < width + height < 3000
  -- contributes (width + height) / 100 estimated words; the rule fires when
  -- those estimated words exceed half of all words (real + estimated).
  rspamd_config.R_SUSPICIOUS_IMAGES = {
    -- @param task object exposing `get_text_parts()`
    -- @return true plus a dynamic weight of (ratio - 0.5) * 2 when the rule
    --         fires, false otherwise
    callback = function(task)
      local tp = task:get_text_parts() -- get text parts in a message

      for _, p in ipairs(tp) do
        local h = p:get_html()

        if h then
          local l = p:get_words_count()
          local img = h:get_images()
          local pic_words = 0

          if img then
            for _, i in ipairs(img) do
              local dim = i['width'] + i['height']
              local tag = i['tag']

              -- Only images wrapped into a link are counted
              if tag and has_anchor_parent(tag) and dim > 100 and dim < 3000 then
                -- We assume that a single picture 100x200 contains approx 3 words of text
                pic_words = pic_words + dim / 100
              end
            end
          end

          -- Skip parts with neither words nor counted images (avoids 0 / 0)
          if l + pic_words > 0 then
            local rel = pic_words / (l + pic_words)

            if rel > 0.5 then
              return true, (rel - 0.5) * 2
            end
          end
        end
      end

      return false
    end,
    score = 5.0,
    group = 'html',
    -- Fixed wording: the rule is about images, not messages
    description = 'Message contains many suspicious images'
  }
  -- Callback symbol that walks selected HTML tags and inserts up to three
  -- results: R_WHITE_ON_WHITE (low contrast text), MANY_INVISIBLE_PARTS
  -- (hidden blocks) and ZERO_FONT (zero sized font). The value returned by
  -- `register_symbol` is kept in `vis_check_id`.
  local vis_check_id = rspamd_config:register_symbol{
    name = 'HTML_VISIBLE_CHECKS',
    type = 'callback',
    group = 'html',
    callback = function(task)
      local tp = task:get_text_parts() -- get text parts in a message
      local ret = false          -- set once any low contrast block is seen
      local transp_rate = 0      -- best "transparent" ratio seen so far
      local invisible_blocks = 0
      local zero_size_blocks = 0
      local arg                  -- option string describing the worst block
      local normal_len = 0       -- length of text treated as visible
      local transp_len = 0       -- weighted length of low contrast text

      for _,p in ipairs(tp) do -- iterate over text parts array using `ipairs`
        normal_len = normal_len + p:get_length()

        local hc = p:is_html() and p:get_html() -- HTML context, if any
        if hc then
          hc:foreach_tag({'font', 'span', 'div', 'p', 'td'}, function(tag)
            local bl = tag:get_extra()

            if bl then
              if not bl['visible'] then
                invisible_blocks = invisible_blocks + 1
              end

              if bl['font_size'] and bl['font_size'] == 0 then
                zero_size_blocks = zero_size_blocks + 1
              end

              if bl['bgcolor'] and bl['color'] and bl['visible'] then
                local color = bl['color']
                local bgcolor = bl['bgcolor']
                -- Should use visual approach here some day
                local diff_r = math.abs(color[1] - bgcolor[1])
                local diff_g = math.abs(color[2] - bgcolor[2])
                local diff_b = math.abs(color[3] - bgcolor[3])
                local r_avg = (color[1] + bgcolor[1]) / 2.0
                -- Square
                diff_r = diff_r * diff_r
                diff_g = diff_g * diff_g
                diff_b = diff_b * diff_b

                -- Weighted colour distance, scaled down by 256
                local diff = math.sqrt(2*diff_r + 4*diff_g + 3 * diff_b +
                  (r_avg * (diff_r - diff_b) / 256.0))
                diff = diff / 256.0

                if diff < 0.1 then
                  ret = true
                  local content_len = #(tag:get_content() or {})
                  invisible_blocks = invisible_blocks + 1 -- This block is invisible
                  -- The closer the colours, the larger the share of the
                  -- content that is moved from "normal" to "transparent"
                  transp_len = transp_len + content_len * (0.1 - diff) * 10.0
                  normal_len = normal_len - content_len
                  local tr = transp_len / (normal_len + transp_len)

                  if tr > transp_rate then
                    transp_rate = tr
                    -- Zero-pad every component so that the colour is always
                    -- rendered as an unambiguous six digit hex value
                    arg = string.format('%s color #%02x%02x%02x bgcolor #%02x%02x%02x',
                      tostring(tag:get_type()),
                      color[1], color[2], color[3],
                      bgcolor[1], bgcolor[2], bgcolor[3])
                  end
                end
              end
            end

            return false -- Continue search
          end)
        end
      end

      if ret then
        transp_rate = transp_len / (normal_len + transp_len)

        if transp_rate > 0.1 then
          -- Cap the rate at 0.5 so that the weight never exceeds 1.0
          if transp_rate > 0.5 or transp_rate ~= transp_rate then
            transp_rate = 0.5
          end

          task:insert_result('R_WHITE_ON_WHITE', (transp_rate * 2.0), arg)
        end
      end

      if invisible_blocks > 0 then
        if invisible_blocks > 10 then
          invisible_blocks = 10
        end

        local rates = { -- From 1 to 10
          0.05,
          0.1,
          0.2,
          0.3,
          0.4,
          0.5,
          0.6,
          0.7,
          0.8,
          1.0,
        }

        task:insert_result('MANY_INVISIBLE_PARTS', rates[invisible_blocks],
          tostring(invisible_blocks))
      end

      if zero_size_blocks > 10 then
        -- Full score
        task:insert_result('ZERO_FONT', 1.0,
          tostring(zero_size_blocks))
      elseif zero_size_blocks > 0 then
        -- Counts of 6..10 are clamped to the last bucket (and reported as 5)
        if zero_size_blocks > 5 then
          zero_size_blocks = 5
        end

        local rates = { -- From 1 to 5
          0.1,
          0.2,
          0.2,
          0.3,
          0.5,
        }

        task:insert_result('ZERO_FONT', rates[zero_size_blocks],
          tostring(zero_size_blocks))
      end
    end,
  }
  -- Virtual symbol attached to the symbol identified by `vis_check_id`
  rspamd_config:register_symbol({
    name = 'R_WHITE_ON_WHITE',
    type = 'virtual',
    parent = vis_check_id,
    group = 'html',
    score = 4.0,
    one_shot = true,
    description = 'Message contains low contrast text',
  })
  -- Virtual symbol attached to the symbol identified by `vis_check_id`
  rspamd_config:register_symbol({
    name = 'ZERO_FONT',
    type = 'virtual',
    parent = vis_check_id,
    group = 'html',
    -- The parent callback inserts the full weight only when more than 10
    -- elements have zero size; smaller counts get a fraction of it
    score = 1.0,
    one_shot = true,
    description = 'Zero sized font used',
  })
  -- Virtual symbol attached to the symbol identified by `vis_check_id`
  rspamd_config:register_symbol({
    name = 'MANY_INVISIBLE_PARTS',
    type = 'virtual',
    parent = vis_check_id,
    group = 'html',
    -- The parent callback inserts the full weight once 10 or more elements
    -- are hidden; smaller counts get a fraction of it
    score = 1.0,
    one_shot = true,
    description = 'Many parts are visually hidden',
  })
  -- Detects `link` tags whose extra data, rendered as a string, ends in
  -- `.css` (optionally followed by a query string or a fragment).
  rspamd_config.EXT_CSS = {
    description = 'Message contains external CSS reference',
    group = 'html',
    score = 1.0,
    callback = function(task)
      local css_re = require("rspamd_regexp").create_cached('/^.*\\.css(?:[?#].*)?$/i')
      local found = false

      -- Visitor for every `link` tag. It returns the running result, so it
      -- yields true from the first match onwards.
      -- NOTE(review): a true return value presumably stops the tag walk;
      -- confirm against the foreach_tag documentation
      local function inspect_link(tag)
        local extra = tag:get_extra()
        if extra then
          local target = tostring(extra)
          if target and css_re:match(target) then
            found = true
          end
        end
        return found
      end

      for _, part in ipairs(task:get_text_parts()) do
        -- Only HTML parts that actually have an HTML context are walked
        if part:is_html() and part:get_html() then
          part:get_html():foreach_tag({'link'}, inspect_link)
        end
      end

      return found
    end,
  }
  -- Detects anchors whose visible text is a URL with one scheme (http or
  -- https) while the anchor's extra data, rendered as a string, starts with
  -- the other scheme.
  rspamd_config.HTTP_TO_HTTPS = {
    -- @param task object exposing `get_text_parts()`
    -- @return true if any HTML part contains such an anchor, false otherwise
    callback = function(task)
      local tp = task:get_text_parts()
      if (not tp) then return false end

      for _,p in ipairs(tp) do
        -- Parts that are not HTML, or that have no HTML context, are skipped
        -- instead of ending the whole check, so every HTML part is examined
        local hc = p:is_html() and p:get_html()

        if hc then
          local found = false

          hc:foreach_tag('a', function (tag)
            -- Skip this loop if we already have a match
            if (found) then return true end

            local c = tag:get_content()
            if (not c) then return false end
            c = tostring(c):lower()
            if (not c:match('^http')) then return false end

            local u = tag:get_extra()
            if (not u) then return false end
            u = tostring(u):lower()
            if (not u:match('^http')) then return false end

            if ((c:match('^http:') and u:match('^https:')) or
                (c:match('^https:') and u:match('^http:')))
            then
              found = true
              return true
            end

            return false
          end)

          if (found) then return true end
        end
      end

      return false
    end,
    description = 'Anchor text contains different scheme to target URL',
    score = 2.0,
    group = 'html'
  }
  -- Detects anchors whose extra data, rendered as a string, is an http(s)
  -- URL with a dotted-quad numeric host.
  rspamd_config.HTTP_TO_IP = {
    -- @param task object exposing `get_text_parts()`
    -- @return true if any HTML part contains such an anchor, false otherwise
    callback = function(task)
      local tp = task:get_text_parts()
      if (not tp) then return false end

      -- True when `u` starts with http(s)://N.N.N.N and the numeric host is
      -- followed by the end of the string or by a port, path, query or
      -- fragment delimiter. This rejects host names that merely begin with
      -- dotted digits, e.g. "http://1.2.3.4.example.com".
      local function points_to_ip(u)
        local tail = u:match('^https?://%d+%.%d+%.%d+%.%d+(.?)')
        if not tail then return false end
        return tail == '' or tail == ':' or tail == '/'
          or tail == '?' or tail == '#'
      end

      for _,p in ipairs(tp) do
        -- Parts that are not HTML, or that have no HTML context, are skipped
        -- instead of ending the whole check, so every HTML part is examined
        local hc = p:is_html() and p:get_html()

        if hc then
          local found = false

          hc:foreach_tag('a', function (tag)
            if (found) then return true end

            local u = tag:get_extra()
            if (u) then
              u = tostring(u):lower()
              if points_to_ip(u) then
                found = true
                return true -- match found, no need to visit more anchors
              end
            end

            return false
          end)

          if found then return true end
        end
      end

      -- Explicit negative result instead of falling off the end with nil
      return false
    end,
    description = 'Anchor points to an IP address',
    score = 1.0,
    group = 'html'
  }