You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

html.lua 12KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462
  1. -- Licensed to the Apache Software Foundation (ASF) under one or more
  2. -- contributor license agreements. See the NOTICE file distributed with
  3. -- this work for additional information regarding copyright ownership.
  4. -- The ASF licenses this file to you under the Apache License, Version 2.0
  5. -- (the "License"); you may not use this file except in compliance with
  6. -- the License. You may obtain a copy of the License at:
  7. --
  8. -- http://www.apache.org/licenses/LICENSE-2.0
  9. --
  10. -- Unless required by applicable law or agreed to in writing, software
  11. -- distributed under the License is distributed on an "AS IS" BASIS,
  12. -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. -- See the License for the specific language governing permissions and
  14. -- limitations under the License.
  15. local reconf = config['regexp']
  16. local rspamd_regexp = require "rspamd_regexp"
  -- Regexp rule for messages whose only part is an HTML part.
  -- The `re` string is stored as-is in the regexp rules table.
  reconf.MIME_HTML_ONLY = {
    description = 'Message has only an HTML part',
    group = 'headers',
    score = 0.2,
    re = 'has_only_html_part()',
  }
  --- Tell whether any ancestor of `tag` is an anchor (`a`) element.
  -- The tag itself is not inspected, only the chain obtained through
  -- repeated `get_parent()` calls.
  -- @param tag object exposing `get_parent()`; ancestors must expose `get_type()`
  -- @return true if some ancestor reports type 'a', false otherwise
  local function has_anchor_parent(tag)
    local ancestor = tag:get_parent()
    while ancestor do
      if ancestor:get_type() == 'a' then
        return true
      end
      ancestor = ancestor:get_parent()
    end
    return false
  end
  --- Look for a linked, embedded image inside an HTML part of a given size.
  -- An HTML part qualifies when its length is in the half-open range
  -- [min, max). Within such a part an image triggers the check when it has a
  -- tag with an anchor ancestor, height + width >= 210 and is embedded.
  -- @param task task object providing `get_text_parts()`
  -- @param min inclusive lower bound on the part length
  -- @param max exclusive upper bound on the part length
  -- @return true on the first matching image; nothing (nil) otherwise
  local function check_html_image(task, min, max)
    for _, part in ipairs(task:get_text_parts()) do
      if part:is_html() then
        local html = part:get_html()
        local part_len = part:get_length()

        if html and part_len >= min and part_len < max then
          for _, image in ipairs(html:get_images() or {}) do
            local tag = image.tag
            -- do not trigger on small and unknown size images
            if tag and has_anchor_parent(tag)
                and image.height + image.width >= 210
                and image.embedded then
              return true
            end
          end
        end
      end
    end
  end
  -- HTML part shorter than 1024 with a linked image (see check_html_image).
  rspamd_config.HTML_SHORT_LINK_IMG_1 = {
    description = 'Short HTML part (0..1K) with a link to an image',
    group = 'html',
    score = 2.0,
    callback = function(task)
      local min_len, max_len = 0, 1024
      return check_html_image(task, min_len, max_len)
    end,
  }
  -- HTML part with length in [1024, 1536) and a linked image (see check_html_image).
  rspamd_config.HTML_SHORT_LINK_IMG_2 = {
    description = 'Short HTML part (1K..1.5K) with a link to an image',
    group = 'html',
    score = 1.0,
    callback = function(task)
      local min_len, max_len = 1024, 1536
      return check_html_image(task, min_len, max_len)
    end,
  }
  -- HTML part with length in [1536, 2048) and a linked image (see check_html_image).
  rspamd_config.HTML_SHORT_LINK_IMG_3 = {
    description = 'Short HTML part (1.5K..2K) with a link to an image',
    group = 'html',
    score = 0.5,
    callback = function(task)
      local min_len, max_len = 1536, 2048
      return check_html_image(task, min_len, max_len)
    end,
  }
  -- Fires when an HTML part with a length below 50 carries a large image
  -- (height + width >= 400) whose tag has no anchor ancestor.
  rspamd_config.R_EMPTY_IMAGE = {
    description = 'Message contains empty parts and image',
    group = 'html',
    score = 2.0,
    callback = function(task)
      -- walk every text part of the message
      for _, part in ipairs(task:get_text_parts()) do
        if part:is_html() then
          local html = part:get_html() -- HTML context of the part
          local text_len = part:get_length() -- length reported for the part

          -- only nearly empty parts are of interest
          if html and text_len < 50 then
            for _, image in ipairs(html:get_images() or {}) do
              -- large image first, then make sure it is not wrapped in a link
              if image.height + image.width >= 400 then
                local tag = image.tag
                if tag and not has_anchor_parent(tag) then
                  return true
                end
              end
            end
          end
        end
      end
    end,
  }
  -- Compares the amount of "text" estimated to be inside linked images with
  -- the number of real words in the same part. Fires when the images account
  -- for more than half of the total, with a dynamic weight of (rel - 0.5) * 2.
  rspamd_config.R_SUSPICIOUS_IMAGES = {
    callback = function(task)
      local tp = task:get_text_parts() -- get text parts in a message

      for _, p in ipairs(tp) do
        local h = p:get_html()

        if h then
          local l = p:get_words_count()
          local img = h:get_images()
          local pic_words = 0

          if img then
            for _, i in ipairs(img) do
              local dim = i['width'] + i['height']
              local tag = i['tag']

              -- only images wrapped in a link are counted
              if tag and has_anchor_parent(tag) then
                if dim > 100 and dim < 3000 then
                  -- We assume that a single picture 100x200 contains approx 3 words of text
                  pic_words = pic_words + dim / 100
                end
              end
            end
          end

          -- guard against 0/0 when the part has neither words nor images
          if l + pic_words > 0 then
            local rel = pic_words / (l + pic_words)

            if rel > 0.5 then
              return true, (rel - 0.5) * 2
            end
          end
        end
      end

      return false
    end,
    score = 5.0,
    group = 'html',
    -- The rule measures images, not messages; the previous text said
    -- "suspicious messages".
    description = 'Message contains many suspicious images'
  }
  -- Callback symbol that inspects the style of selected HTML tags and inserts
  -- up to three results: R_WHITE_ON_WHITE (transparent text),
  -- MANY_INVISIBLE_PARTS (hidden blocks) and ZERO_FONT (zero font size).
  -- The id returned by register_symbol is kept in `vis_check_id`.
  local vis_check_id = rspamd_config:register_symbol {
    name = 'HTML_VISIBLE_CHECKS',
    type = 'callback',
    group = 'html',
    callback = function(task)
      local tp = task:get_text_parts() -- get text parts in a message
      local ret = false -- becomes true once a transparent leaf block is seen
      local transp_rate = 0 -- highest transparent/total ratio seen so far
      local invisible_blocks = 0
      local zero_size_blocks = 0
      local arg -- option string describing the block with the highest ratio
      local normal_len = 0 -- accumulated length not attributed to transparent blocks
      local transp_len = 0 -- accumulated content length of transparent blocks

      for _, p in ipairs(tp) do
        normal_len = normal_len + p:get_length()

        local hc = p:is_html() and p:get_html()
        if hc then
          hc:foreach_tag({ 'font', 'span', 'div', 'p', 'td' }, function(tag, clen, is_leaf)
            local bl = tag:get_style()

            if bl then
              if not bl.visible and clen > 0 and is_leaf then
                invisible_blocks = invisible_blocks + 1
              end

              -- a missing font size is treated as 12, i.e. not zero
              if (bl.font_size or 12) == 0 and clen > 0 and is_leaf then
                zero_size_blocks = zero_size_blocks + 1
              end

              if bl.transparent and is_leaf then
                ret = true
                invisible_blocks = invisible_blocks + 1 -- This block is invisible
                transp_len = transp_len + clen
                normal_len = normal_len - clen

                local tr = transp_len / (normal_len + transp_len)
                if tr > transp_rate then
                  transp_rate = tr
                  -- fall back to black when a colour is absent
                  local fg = bl.color or { 0, 0, 0 }
                  local bg = bl.bgcolor or { 0, 0, 0 }
                  -- %02x keeps two hex digits per component; plain %x drops
                  -- leading zeros and yields ambiguous colours
                  arg = string.format('%s color #%02x%02x%02x bgcolor #%02x%02x%02x',
                      tag:get_type(),
                      fg[1], fg[2], fg[3],
                      bg[1], bg[2], bg[3])
                end
              end
            end

            return false -- Continue search
          end)
        end
      end

      if ret then
        transp_rate = transp_len / (normal_len + transp_len)

        if transp_rate > 0.1 then
          -- cap at 0.5 so the weight passed below never exceeds 1.0
          if transp_rate > 0.5 or transp_rate ~= transp_rate then
            transp_rate = 0.5
          end

          task:insert_result('R_WHITE_ON_WHITE', (transp_rate * 2.0), arg)
        end
      end

      if invisible_blocks > 0 then
        if invisible_blocks > 10 then
          invisible_blocks = 10
        end

        local rates = { -- From 1 to 10
          0.05,
          0.1,
          0.2,
          0.3,
          0.4,
          0.5,
          0.6,
          0.7,
          0.8,
          1.0,
        }

        task:insert_result('MANY_INVISIBLE_PARTS', rates[invisible_blocks],
            tostring(invisible_blocks))
      end

      if zero_size_blocks > 10 then
        -- Full score
        task:insert_result('ZERO_FONT', 1.0,
            tostring(zero_size_blocks))
      elseif zero_size_blocks > 0 then
        -- counts of 6..10 are reported as 5
        if zero_size_blocks > 5 then
          zero_size_blocks = 5
        end

        local rates = { -- From 1 to 5
          0.1,
          0.2,
          0.2,
          0.3,
          0.5,
        }

        task:insert_result('ZERO_FONT', rates[zero_size_blocks],
            tostring(zero_size_blocks))
      end
    end,
  }
  -- Virtual symbol inserted by the HTML_VISIBLE_CHECKS callback.
  rspamd_config:register_symbol({
    name = 'R_WHITE_ON_WHITE',
    type = 'virtual',
    parent = vis_check_id,
    group = 'html',
    score = 4.0,
    one_shot = true,
    description = 'Message contains low contrast text',
  })
  -- Virtual symbol inserted by the HTML_VISIBLE_CHECKS callback.
  rspamd_config:register_symbol({
    name = 'ZERO_FONT',
    type = 'virtual',
    parent = vis_check_id,
    group = 'html',
    -- Reached if more than 5 elements have zero size
    score = 1.0,
    one_shot = true,
    description = 'Zero sized font used',
  })
  -- Virtual symbol inserted by the HTML_VISIBLE_CHECKS callback.
  rspamd_config:register_symbol({
    name = 'MANY_INVISIBLE_PARTS',
    type = 'virtual',
    parent = vis_check_id,
    group = 'html',
    -- Reached if more than 10 elements are hidden
    score = 1.0,
    one_shot = true,
    description = 'Many parts are visually hidden',
  })
  -- Fires when a `link` tag's extra data, converted to a string, matches a
  -- path ending in ".css" (optionally followed by a query or fragment).
  rspamd_config.EXT_CSS = {
    description = 'Message contains external CSS reference',
    group = 'html',
    score = 1.0,
    callback = function(task)
      local regexp_lib = require "rspamd_regexp"
      local css_re = regexp_lib.create_cached('/^.*\\.css(?:[?#].*)?$/i')
      local found = false

      for _, part in ipairs(task:get_text_parts()) do
        -- only HTML parts that actually have an HTML context
        local html = part:is_html() and part:get_html()
        if html then
          html:foreach_tag({ 'link' }, function(tag)
            local extra = tag:get_extra()
            if extra and css_re:match(tostring(extra)) then
              found = true
            end
            -- a true return value ends the tag walk once a match is known
            return found
          end)
        end
      end

      return found
    end,
  }
  -- Matches anchor content that starts with "https:" (case-insensitive).
  local https_re = rspamd_regexp.create_cached('/^https:/i')

  -- Fires when an anchor's content starts with "https:" while the tag's URL
  -- reports the 'http' protocol. Only the first HTML part is examined; the
  -- host of the offending URL is returned as the symbol option.
  rspamd_config.HTTP_TO_HTTPS = {
    description = 'The anchor text contains a distinct scheme compared to the target URL',
    score = 0.5,
    group = 'html',
    callback = function(task)
      local matched_host

      for _, part in ipairs(task:get_text_parts() or {}) do
        if part:is_html() then
          local html = part:get_html()
          if not html then
            return false
          end

          local matched = false
          html:foreach_tag('a', function(tag, _)
            -- Nothing more to do once a match has been recorded
            if matched then
              return true
            end

            local content = tag:get_content()
            if not content or not https_re:match(content) then
              return false
            end

            local url = tag:get_extra()
            if not url or url:get_protocol() ~= 'http' then
              return false
            end

            -- http in href combined with https in the visible part
            matched = true
            matched_host = url:get_host()
            return true
          end)

          if matched then
            return true, 1.0, matched_host
          end
          return false
        end
      end

      return false
    end,
  }
  --- Tell whether a lower-cased URL string has a dotted-quad host.
  -- The four numeric groups must be followed by the end of the string or by
  -- one of `:`, `/`, `?`, `#`, so that host names which merely start with
  -- digits (e.g. "http://1.2.3.4.example.com/") are not taken for an address.
  -- @param url_str URL as a string
  -- @return true when the host part looks like an IPv4 address, false otherwise
  local function url_host_is_ipv4(url_str)
    local rest = url_str:match('^https?://%d+%.%d+%.%d+%.%d+(.*)$')
    if not rest then
      return false
    end
    return rest == '' or rest:find('^[:/?#]') ~= nil
  end

  -- Fires when an anchor in the first HTML part of the message points to an
  -- http(s) URL whose host is a dotted-quad address.
  rspamd_config.HTTP_TO_IP = {
    callback = function(task)
      local tp = task:get_text_parts()
      if (not tp) then
        return false
      end

      for _, p in ipairs(tp) do
        if p:is_html() then
          local hc = p:get_html()
          if (not hc) then
            return false
          end

          local found = false
          hc:foreach_tag('a', function(tag)
            if not found then
              local u = tag:get_extra()
              if u and url_host_is_ipv4(tostring(u):lower()) then
                found = true
              end
            end
            -- returning true stops the walk as soon as a match is known
            return found
          end)

          -- only the first HTML part is examined
          return found
        end
      end

      -- no HTML part at all
      return false
    end,
    description = 'HTML anchor points to an IP address',
    score = 1.0,
    group = 'html'
  }