You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

html.lua 12KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437
  1. -- Licensed to the Apache Software Foundation (ASF) under one or more
  2. -- contributor license agreements. See the NOTICE file distributed with
  3. -- this work for additional information regarding copyright ownership.
  4. -- The ASF licenses this file to you under the Apache License, Version 2.0
  5. -- (the "License"); you may not use this file except in compliance with
  6. -- the License. You may obtain a copy of the License at:
  7. --
  8. -- http://www.apache.org/licenses/LICENSE-2.0
  9. --
  10. -- Unless required by applicable law or agreed to in writing, software
  11. -- distributed under the License is distributed on an "AS IS" BASIS,
  12. -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. -- See the License for the specific language governing permissions and
  14. -- limitations under the License.
  -- Shorthand for the 'regexp' section of the global `config` table
  local reconf = config['regexp']

  -- Rule for messages that consist of an HTML part only; the check itself is
  -- delegated to the `has_only_html_part()` expression named in `re`.
  reconf.MIME_HTML_ONLY = {
    description = 'Messages that have only HTML part',
    group = 'headers',
    score = 0.2,
    re = 'has_only_html_part()',
  }
  --- Tell whether any ancestor of an HTML tag is an anchor (`a`) element.
  -- The tag itself is never inspected, only the chain reached through
  -- successive `get_parent()` calls.
  -- @param tag object exposing `get_parent()`; every ancestor must also
  --            expose `get_type()`
  -- @return true when some ancestor has type 'a', false otherwise
  local function has_anchor_parent(tag)
    local ancestor = tag:get_parent()
    while ancestor do
      if ancestor:get_type() == 'a' then
        return true
      end
      ancestor = ancestor:get_parent()
    end
    return false
  end
  --- Look for a linked image inside an HTML part of a given length range.
  -- A part qualifies when it is HTML, has an HTML context and its length lies
  -- in [min, max). Within such a part an image matches when its tag has an
  -- anchor ancestor and either height + width >= 210 or the image is not
  -- embedded.
  -- @param task object exposing `get_text_parts()`
  -- @param min inclusive lower bound for the part length
  -- @param max exclusive upper bound for the part length
  -- @return true on the first match; returns nothing when no image matches
  local function check_html_image(task, min, max)
    for _, part in ipairs(task:get_text_parts()) do
      if part:is_html() then
        local html = part:get_html()
        local size = part:get_length()
        -- `images` stays falsy unless the part is in range and has images
        local images = html and size >= min and size < max and html:get_images()
        if images then
          for _, img in ipairs(images) do
            local tag = img['tag']
            if tag and has_anchor_parent(tag) then
              -- do not trigger on small and unknown size images
              if img['height'] + img['width'] >= 210 or not img['embedded'] then
                return true
              end
            end
          end
        end
      end
    end
  end
  -- Three rules sharing one shape: each fires when check_html_image() finds a
  -- linked image in an HTML part whose length falls in the given range. The
  -- shorter the part, the higher the score.
  do
    -- Builds the rule table for the length range [min, max)
    local function short_link_img_rule(min, max, score, description)
      return {
        callback = function(task)
          return check_html_image(task, min, max)
        end,
        score = score,
        group = 'html',
        description = description,
      }
    end

    rspamd_config.HTML_SHORT_LINK_IMG_1 = short_link_img_rule(0, 1024, 2.0,
      'Short html part (0..1K) with a link to an image')

    rspamd_config.HTML_SHORT_LINK_IMG_2 = short_link_img_rule(1024, 1536, 1.0,
      'Short html part (1K..1.5K) with a link to an image')

    rspamd_config.HTML_SHORT_LINK_IMG_3 = short_link_img_rule(1536, 2048, 0.5,
      'Short html part (1.5K..2K) with a link to an image')
  end
  -- Fires for an HTML part with a length below 50 that carries a large image
  -- (height + width >= 400) which is NOT wrapped in a link.
  rspamd_config.R_EMPTY_IMAGE = {
    description = 'Message contains empty parts and image',
    group = 'html',
    score = 2.0,
    callback = function(task)
      for _, part in ipairs(task:get_text_parts()) do
        if part:is_html() then
          local html = part:get_html()
          local text_len = part:get_length()
          -- Images are only fetched for parts that are nearly empty
          local images = html and text_len < 50 and html:get_images()
          if images then
            for _, img in ipairs(images) do
              if img['height'] + img['width'] >= 400 then
                local tag = img['tag']
                if tag and not has_anchor_parent(tag) then
                  return true
                end
              end
            end
          end
        end
      end
      -- No explicit return: the callback yields nothing when no image matches
    end,
  }
  -- Fires when linked images dominate a part compared with its text.
  --
  -- For every text part that has an HTML context, each image whose tag has an
  -- anchor ancestor and whose width + height lies strictly between 100 and
  -- 3000 is converted into an estimated number of "picture words". If those
  -- make up more than half of (real words + picture words), the callback
  -- returns true together with a weight (rel - 0.5) * 2, i.e. a value in (0, 1].
  -- Returns false when no part crosses the threshold.
  rspamd_config.R_SUSPICIOUS_IMAGES = {
    callback = function(task)
      local tp = task:get_text_parts() -- get text parts in a message

      for _, p in ipairs(tp) do
        local h = p:get_html()

        if h then
          local l = p:get_words_count()
          local img = h:get_images()
          local pic_words = 0

          if img then
            for _, i in ipairs(img) do
              local dim = i['width'] + i['height']
              local tag = i['tag']

              if tag then
                if has_anchor_parent(tag) then
                  if dim > 100 and dim < 3000 then
                    -- We assume that a single picture 100x200 contains approx 3 words of text
                    pic_words = pic_words + dim / 100
                  end
                end
              end
            end
          end

          -- Guard against 0 / 0 for parts with neither words nor images
          if l + pic_words > 0 then
            local rel = pic_words / (l + pic_words)

            if rel > 0.5 then
              return true, (rel - 0.5) * 2
            end
          end
        end
      end

      return false
    end,
    score = 5.0,
    group = 'html',
    -- The rule measures images, not messages; the description now says so
    description = 'Message contains many suspicious images'
  }
  -- Callback symbol that inspects the styling of HTML blocks and inserts up to
  -- three results through task:insert_result():
  --   * R_WHITE_ON_WHITE     - text whose colour is close to its background
  --   * MANY_INVISIBLE_PARTS - leaf blocks that are hidden or low contrast
  --   * ZERO_FONT            - leaf blocks with a font size of zero
  -- The value returned by register_symbol is kept in `vis_check_id`.
  local vis_check_id = rspamd_config:register_symbol{
    name = 'HTML_VISIBLE_CHECKS',
    type = 'callback',
    group = 'html',
    callback = function(task)
      local tp = task:get_text_parts() -- get text parts in a message
      local ret = false -- becomes true once a low contrast block is seen
      local transp_rate = 0
      local invisible_blocks = 0
      local zero_size_blocks = 0
      local arg -- describes the worst low contrast block; third insert_result argument
      local normal_len = 0
      local transp_len = 0

      for _,p in ipairs(tp) do -- iterate over text parts array using `ipairs`
        normal_len = normal_len + p:get_length()

        if p:is_html() and p:get_html() then -- if the current part is html part
          local hc = p:get_html() -- we get HTML context

          hc:foreach_tag({'font', 'span', 'div', 'p', 'td'}, function(tag, clen, is_leaf)
            local bl = tag:get_extra()

            if bl then
              if not bl['visible'] and is_leaf then
                invisible_blocks = invisible_blocks + 1
              end

              if bl['font_size'] and bl['font_size'] == 0 and is_leaf then
                zero_size_blocks = zero_size_blocks + 1
              end

              if bl['bgcolor'] and bl['color'] and bl['visible'] and is_leaf then
                local color = bl['color']
                local bgcolor = bl['bgcolor']
                -- Should use visual approach here some day
                -- Weighted distance between the two colours, built from the
                -- squared per-channel differences and the mean red value.
                local diff_r = math.abs(color[1] - bgcolor[1])
                local diff_g = math.abs(color[2] - bgcolor[2])
                local diff_b = math.abs(color[3] - bgcolor[3])
                local r_avg = (color[1] + bgcolor[1]) / 2.0
                -- Square
                diff_r = diff_r * diff_r
                diff_g = diff_g * diff_g
                diff_b = diff_b * diff_b

                local diff = math.sqrt(2*diff_r + 4*diff_g + 3 * diff_b +
                  (r_avg * (diff_r - diff_b) / 256.0))
                diff = diff / 256.0

                if diff < 0.1 then
                  ret = true
                  invisible_blocks = invisible_blocks + 1 -- This block is invisible
                  -- The closer the colours, the more of the block's length
                  -- is counted as transparent text
                  transp_len = transp_len + clen * (0.1 - diff) * 10.0
                  normal_len = normal_len - clen
                  local tr = transp_len / (normal_len + transp_len)

                  if tr > transp_rate then
                    transp_rate = tr
                    -- %02x keeps two hex digits per channel; plain %x drops
                    -- leading zeros and makes e.g. 0a/0b/0c print as "abc"
                    arg = string.format('%s color #%02x%02x%02x bgcolor #%02x%02x%02x',
                      tostring(tag:get_type()),
                      color[1], color[2], color[3],
                      bgcolor[1], bgcolor[2], bgcolor[3])
                  end
                end
              end
            end

            return false -- Continue search
          end)
        end
      end

      if ret then
        transp_rate = transp_len / (normal_len + transp_len)

        if transp_rate > 0.1 then
          -- Cap the rate at 0.5 so the weight passed below never exceeds 1.0.
          -- NOTE: the NaN test (x ~= x) cannot succeed here because a NaN
          -- already fails the `> 0.1` comparison above; kept as a safeguard.
          if transp_rate > 0.5 or transp_rate ~= transp_rate then
            transp_rate = 0.5
          end

          task:insert_result('R_WHITE_ON_WHITE', (transp_rate * 2.0), arg)
        end
      end

      if invisible_blocks > 0 then
        if invisible_blocks > 10 then
          invisible_blocks = 10
        end

        local rates = { -- From 1 to 10
          0.05,
          0.1,
          0.2,
          0.3,
          0.4,
          0.5,
          0.6,
          0.7,
          0.8,
          1.0,
        }

        task:insert_result('MANY_INVISIBLE_PARTS', rates[invisible_blocks],
          tostring(invisible_blocks))
      end

      if zero_size_blocks > 0 then
        if zero_size_blocks > 5 then
          if zero_size_blocks > 10 then
            -- Full score
            task:insert_result('ZERO_FONT', 1.0,
              tostring(zero_size_blocks))
          else
            -- 6..10 blocks are treated like 5 and take the last rate below
            zero_size_blocks = 5
          end
        end

        if zero_size_blocks <= 5 then
          local rates = { -- From 1 to 5
            0.1,
            0.2,
            0.2,
            0.3,
            0.5,
          }

          task:insert_result('ZERO_FONT', rates[zero_size_blocks],
            tostring(zero_size_blocks))
        end
      end
    end,
  }
  -- Virtual symbols whose results are inserted by the HTML_VISIBLE_CHECKS
  -- callback; all of them share the same parent, group and one_shot flag.
  do
    local virtual_symbols = {
      {
        name = 'R_WHITE_ON_WHITE',
        description = 'Message contains low contrast text',
        score = 4.0,
      },
      {
        name = 'ZERO_FONT',
        description = 'Zero sized font used',
        -- The callback applies the full weight only above 10 zero size blocks
        score = 1.0,
      },
      {
        name = 'MANY_INVISIBLE_PARTS',
        description = 'Many parts are visually hidden',
        -- The callback applies the full weight for 10 or more hidden blocks
        score = 1.0,
      },
    }

    -- Registration order matches the order of the list above
    for _, sym in ipairs(virtual_symbols) do
      rspamd_config:register_symbol{
        type = 'virtual',
        parent = vis_check_id,
        name = sym.name,
        description = sym.description,
        score = sym.score,
        group = 'html',
        one_shot = true,
      }
    end
  end
  -- Fires when a `link` tag in an HTML part carries extra data whose string
  -- form ends in ".css", optionally followed by a query or fragment.
  rspamd_config.EXT_CSS = {
    description = 'Message contains external CSS reference',
    group = 'html',
    score = 1.0,
    callback = function(task)
      local rspamd_regexp = require "rspamd_regexp"
      local css_re = rspamd_regexp.create_cached('/^.*\\.css(?:[?#].*)?$/i')
      local parts = task:get_text_parts()
      local found = false

      -- Tag visitor: records a match and, once one was seen, keeps returning
      -- true so that the traversal can stop.
      local function visit_link(tag)
        local extra = tag:get_extra()
        if extra then
          local href = tostring(extra)
          if href and css_re:match(href) then
            found = true
          end
        end
        return found
      end

      for _, part in ipairs(parts) do
        if part:is_html() and part:get_html() then
          part:get_html():foreach_tag({'link'}, visit_link)
        end
      end

      return found
    end,
  }
  -- Fires when the visible text of an anchor is a URL whose scheme (http or
  -- https) differs from the scheme of the anchor's target.
  -- Only the first HTML part of the message is examined: the callback returns
  -- as soon as that part has been scanned.
  rspamd_config.HTTP_TO_HTTPS = {
    description = 'Anchor text contains different scheme to target URL',
    score = 2.0,
    group = 'html',
    callback = function(task)
      local parts = task:get_text_parts()
      if not parts then
        return false
      end

      for _, part in ipairs(parts) do
        if part:is_html() then
          local html = part:get_html()
          if not html then
            return false
          end

          local found = false

          html:foreach_tag('a', function(tag)
            -- Nothing left to do once a mismatch has been recorded
            if found then
              return true
            end

            local content = tag:get_content()
            if not content then
              return false
            end

            -- Both the anchor text and the target must look like http(s) URLs
            local text = tostring(content):lower()
            if not text:match('^http') then
              return false
            end

            local target = tag:get_extra()
            if not target then
              return false
            end

            local href = tostring(target):lower()
            if not href:match('^http') then
              return false
            end

            local downgrade = text:match('^https:') and href:match('^http:')
            local upgrade = text:match('^http:') and href:match('^https:')
            if upgrade or downgrade then
              found = true
            end

            -- true stops the traversal, false moves on to the next anchor
            return found
          end)

          return found
        end
      end

      return false
    end,
  }
  -- Fires when an anchor in the first HTML part of a message points to a URL
  -- whose host is a dotted-quad IP address.
  do
    -- Lua patterns have no alternation, so "the host ends here" is written as
    -- two patterns: the dotted quad is followed either by the end of the
    -- string or by a port, path, query or fragment delimiter. Without this
    -- anchor a host name such as 1.2.3.4.example.com was reported as an IP.
    local ip_url_exact = '^https?://%d+%.%d+%.%d+%.%d+$'
    local ip_url_prefix = '^https?://%d+%.%d+%.%d+%.%d+[:/?#]'

    -- True when the lower-cased URL string has a dotted-quad host
    local function points_to_ip(url)
      return url:match(ip_url_exact) ~= nil or url:match(ip_url_prefix) ~= nil
    end

    rspamd_config.HTTP_TO_IP = {
      callback = function(task)
        local tp = task:get_text_parts()
        if (not tp) then return false end

        for _,p in ipairs(tp) do
          if p:is_html() then
            local hc = p:get_html()
            if (not hc) then return false end

            local found = false

            hc:foreach_tag('a', function (tag)
              if (found) then return true end

              local u = tag:get_extra()
              if (u) then
                u = tostring(u):lower()
                if points_to_ip(u) then
                  found = true
                end
              end

              -- Returning true stops the traversal as soon as a match is seen
              return found
            end)

            return found
          end
        end

        -- No HTML part at all: answer explicitly, as HTTP_TO_HTTPS does
        return false
      end,
      description = 'Anchor points to an IP address',
      score = 1.0,
      group = 'html'
    }
  end