選択できるのは25トピックまでです。 トピックは、先頭が英数字で、英数字とダッシュ('-')を使用した35文字以内のものにしてください。

html.lua 12KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434
  1. -- Licensed to the Apache Software Foundation (ASF) under one or more
  2. -- contributor license agreements. See the NOTICE file distributed with
  3. -- this work for additional information regarding copyright ownership.
  4. -- The ASF licenses this file to you under the Apache License, Version 2.0
  5. -- (the "License"); you may not use this file except in compliance with
  6. -- the License. You may obtain a copy of the License at:
  7. --
  8. -- http://www.apache.org/licenses/LICENSE-2.0
  9. --
  10. -- Unless required by applicable law or agreed to in writing, software
  11. -- distributed under the License is distributed on an "AS IS" BASIS,
  12. -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. -- See the License for the specific language governing permissions and
  14. -- limitations under the License.
  15. local reconf = config['regexp']
  16. -- Messages that have only HTML part
  17. reconf['MIME_HTML_ONLY'] = {
  18. re = 'has_only_html_part()',
  19. score = 0.2,
  20. description = 'Messages that have only HTML part',
  21. group = 'headers'
  22. }
  23. local function check_html_image(task, min, max)
  24. local tp = task:get_text_parts()
  25. for _,p in ipairs(tp) do
  26. if p:is_html() then
  27. local hc = p:get_html()
  28. local len = p:get_length()
  29. if hc and len >= min and len < max then
  30. local images = hc:get_images()
  31. if images then
  32. for _,i in ipairs(images) do
  33. local tag = i['tag']
  34. if tag then
  35. local parent = tag:get_parent()
  36. if parent then
  37. if parent:get_type() == 'a' then
  38. -- do not trigger on small and unknown size images
  39. if i['height'] + i['width'] >= 210 or not i['embedded'] then
  40. return true
  41. end
  42. end
  43. end
  44. end
  45. end
  46. end
  47. end
  48. end
  49. end
  50. end
  51. rspamd_config.HTML_SHORT_LINK_IMG_1 = {
  52. callback = function(task)
  53. return check_html_image(task, 0, 1024)
  54. end,
  55. score = 2.0,
  56. group = 'html',
  57. description = 'Short html part (0..1K) with a link to an image'
  58. }
  59. rspamd_config.HTML_SHORT_LINK_IMG_2 = {
  60. callback = function(task)
  61. return check_html_image(task, 1024, 1536)
  62. end,
  63. score = 1.0,
  64. group = 'html',
  65. description = 'Short html part (1K..1.5K) with a link to an image'
  66. }
  67. rspamd_config.HTML_SHORT_LINK_IMG_3 = {
  68. callback = function(task)
  69. return check_html_image(task, 1536, 2048)
  70. end,
  71. score = 0.5,
  72. group = 'html',
  73. description = 'Short html part (1.5K..2K) with a link to an image'
  74. }
  75. rspamd_config.R_EMPTY_IMAGE = {
  76. callback = function(task)
  77. local tp = task:get_text_parts() -- get text parts in a message
  78. for _,p in ipairs(tp) do -- iterate over text parts array using `ipairs`
  79. if p:is_html() then -- if the current part is html part
  80. local hc = p:get_html() -- we get HTML context
  81. local len = p:get_length() -- and part's length
  82. if hc and len < 50 then -- if we have a part that has less than 50 bytes of text
  83. local images = hc:get_images() -- then we check for HTML images
  84. if images then -- if there are images
  85. for _,i in ipairs(images) do -- then iterate over images in the part
  86. if i['height'] + i['width'] >= 400 then -- if we have a large image
  87. local tag = i['tag']
  88. if tag then
  89. local parent = tag:get_parent()
  90. if parent then
  91. if parent:get_type() ~= 'a' then
  92. return true
  93. end
  94. end
  95. end
  96. end
  97. end
  98. end
  99. end
  100. end
  101. end
  102. end,
  103. score = 2.0,
  104. group = 'html',
  105. description = 'Message contains empty parts and image'
  106. }
  107. rspamd_config.R_SUSPICIOUS_IMAGES = {
  108. callback = function(task)
  109. local tp = task:get_text_parts() -- get text parts in a message
  110. for _, p in ipairs(tp) do
  111. local h = p:get_html()
  112. if h then
  113. local l = p:get_words_count()
  114. local img = h:get_images()
  115. local pic_words = 0
  116. if img then
  117. for _, i in ipairs(img) do
  118. local dim = i['width'] + i['height']
  119. local tag = i['tag']
  120. if tag then
  121. local parent = tag:get_parent()
  122. if parent then
  123. if parent:get_type() == 'a' then
  124. -- do not trigger on small and large images
  125. if dim > 100 and dim < 3000 then
  126. -- We assume that a single picture 100x200 contains approx 3 words of text
  127. pic_words = pic_words + dim / 100
  128. end
  129. end
  130. end
  131. end
  132. end
  133. end
  134. if l + pic_words > 0 then
  135. local rel = pic_words / (l + pic_words)
  136. if rel > 0.5 then
  137. return true, (rel - 0.5) * 2
  138. end
  139. end
  140. end
  141. end
  142. return false
  143. end,
  144. score = 5.0,
  145. group = 'html',
  146. description = 'Message contains many suspicious messages'
  147. }
  148. local vis_check_id = rspamd_config:register_symbol{
  149. name = 'HTML_VISIBLE_CHECKS',
  150. type = 'callback',
  151. group = 'html',
  152. callback = function(task)
  153. --local logger = require "rspamd_logger"
  154. local tp = task:get_text_parts() -- get text parts in a message
  155. local ret = false
  156. local diff = 0.0
  157. local transp_rate = 0
  158. local invisible_blocks = 0
  159. local zero_size_blocks = 0
  160. local arg
  161. local normal_len = 0
  162. local transp_len = 0
  163. for _,p in ipairs(tp) do -- iterate over text parts array using `ipairs`
  164. normal_len = normal_len + p:get_length()
  165. if p:is_html() and p:get_html() then -- if the current part is html part
  166. local hc = p:get_html() -- we get HTML context
  167. hc:foreach_tag({'font', 'span', 'div', 'p', 'td'}, function(tag)
  168. local bl = tag:get_extra()
  169. if bl then
  170. if not bl['visible'] then
  171. invisible_blocks = invisible_blocks + 1
  172. end
  173. if bl['font_size'] and bl['font_size'] == 0 then
  174. zero_size_blocks = zero_size_blocks + 1
  175. end
  176. if bl['bgcolor'] and bl['color'] and bl['visible'] then
  177. local color = bl['color']
  178. local bgcolor = bl['bgcolor']
  179. -- Should use visual approach here some day
  180. local diff_r = math.abs(color[1] - bgcolor[1])
  181. local diff_g = math.abs(color[2] - bgcolor[2])
  182. local diff_b = math.abs(color[3] - bgcolor[3])
  183. local r_avg = (color[1] + bgcolor[1]) / 2.0
  184. -- Square
  185. diff_r = diff_r * diff_r
  186. diff_g = diff_g * diff_g
  187. diff_b = diff_b * diff_b
  188. diff = math.sqrt(2*diff_r + 4*diff_g + 3 * diff_b +
  189. (r_avg * (diff_r - diff_b) / 256.0))
  190. diff = diff / 256.0
  191. if diff < 0.1 then
  192. ret = true
  193. local content_len = #(tag:get_content() or {})
  194. invisible_blocks = invisible_blocks + 1 -- This block is invisible
  195. transp_len = transp_len + content_len * (0.1 - diff) * 10.0
  196. normal_len = normal_len - content_len
  197. local tr = transp_len / (normal_len + transp_len)
  198. if tr > transp_rate then
  199. transp_rate = tr
  200. arg = string.format('%s color #%x%x%x bgcolor #%x%x%x',
  201. tostring(tag:get_type()),
  202. color[1], color[2], color[3],
  203. bgcolor[1], bgcolor[2], bgcolor[3])
  204. end
  205. end
  206. end
  207. end
  208. return false -- Continue search
  209. end)
  210. end
  211. end
  212. if ret then
  213. transp_rate = transp_len / (normal_len + transp_len)
  214. if transp_rate > 0.1 then
  215. if transp_rate > 0.5 or transp_rate ~= transp_rate then
  216. transp_rate = 0.5
  217. end
  218. task:insert_result('R_WHITE_ON_WHITE', (transp_rate * 2.0), arg)
  219. end
  220. end
  221. if invisible_blocks > 0 then
  222. if invisible_blocks > 10 then
  223. invisible_blocks = 10
  224. end
  225. local rates = { -- From 1 to 10
  226. 0.05,
  227. 0.1,
  228. 0.2,
  229. 0.3,
  230. 0.4,
  231. 0.5,
  232. 0.6,
  233. 0.7,
  234. 0.8,
  235. 1.0,
  236. }
  237. task:insert_result('MANY_INVISIBLE_PARTS', rates[invisible_blocks],
  238. tostring(invisible_blocks))
  239. end
  240. if zero_size_blocks > 0 then
  241. if zero_size_blocks > 5 then
  242. if zero_size_blocks > 10 then
  243. -- Full score
  244. task:insert_result('ZERO_FONT', 1.0,
  245. tostring(zero_size_blocks))
  246. else
  247. zero_size_blocks = 5
  248. end
  249. end
  250. if zero_size_blocks <= 5 then
  251. local rates = { -- From 1 to 5
  252. 0.1,
  253. 0.2,
  254. 0.2,
  255. 0.3,
  256. 0.5,
  257. }
  258. task:insert_result('ZERO_FONT', rates[zero_size_blocks],
  259. tostring(zero_size_blocks))
  260. end
  261. end
  262. end,
  263. }
  264. rspamd_config:register_symbol{
  265. type = 'virtual',
  266. parent = vis_check_id,
  267. name = 'R_WHITE_ON_WHITE',
  268. description = 'Message contains low contrast text',
  269. score = 4.0,
  270. group = 'html',
  271. one_shot = true,
  272. }
  273. rspamd_config:register_symbol{
  274. type = 'virtual',
  275. parent = vis_check_id,
  276. name = 'ZERO_FONT',
  277. description = 'Zero sized font used',
  278. score = 1.0, -- Reached if more than 5 elements have zero size
  279. one_shot = true,
  280. group = 'html'
  281. }
  282. rspamd_config:register_symbol{
  283. type = 'virtual',
  284. parent = vis_check_id,
  285. name = 'MANY_INVISIBLE_PARTS',
  286. description = 'Many parts are visually hidden',
  287. score = 1.0, -- Reached if more than 10 elements are hidden
  288. one_shot = true,
  289. group = 'html'
  290. }
  291. rspamd_config.EXT_CSS = {
  292. callback = function(task)
  293. local regexp_lib = require "rspamd_regexp"
  294. local re = regexp_lib.create_cached('/^.*\\.css(?:[?#].*)?$/i')
  295. local tp = task:get_text_parts() -- get text parts in a message
  296. local ret = false
  297. for _,p in ipairs(tp) do -- iterate over text parts array using `ipairs`
  298. if p:is_html() and p:get_html() then -- if the current part is html part
  299. local hc = p:get_html() -- we get HTML context
  300. hc:foreach_tag({'link'}, function(tag)
  301. local bl = tag:get_extra()
  302. if bl then
  303. local s = tostring(bl)
  304. if s and re:match(s) then
  305. ret = true
  306. end
  307. end
  308. return ret -- Continue search
  309. end)
  310. end
  311. end
  312. return ret
  313. end,
  314. score = 1.0,
  315. group = 'html',
  316. description = 'Message contains external CSS reference'
  317. }
  318. rspamd_config.HTTP_TO_HTTPS = {
  319. callback = function(task)
  320. local tp = task:get_text_parts()
  321. if (not tp) then return false end
  322. for _,p in ipairs(tp) do
  323. if p:is_html() then
  324. local hc = p:get_html()
  325. if (not hc) then return false end
  326. local found = false
  327. hc:foreach_tag('a', function (tag, length)
  328. -- Skip this loop if we already have a match
  329. if (found) then return true end
  330. local c = tag:get_content()
  331. if (c) then
  332. c = tostring(c):lower()
  333. if (not c:match('^http')) then return false end
  334. local u = tag:get_extra()
  335. if (not u) then return false end
  336. u = tostring(u):lower()
  337. if (not u:match('^http')) then return false end
  338. if ((c:match('^http:') and u:match('^https:')) or
  339. (c:match('^https:') and u:match('^http:')))
  340. then
  341. found = true
  342. return true
  343. end
  344. end
  345. return false
  346. end)
  347. if (found) then return true end
  348. return false
  349. end
  350. end
  351. return false
  352. end,
  353. description = 'Anchor text contains different scheme to target URL',
  354. score = 2.0,
  355. group = 'html'
  356. }
  357. rspamd_config.HTTP_TO_IP = {
  358. callback = function(task)
  359. local tp = task:get_text_parts()
  360. if (not tp) then return false end
  361. for _,p in ipairs(tp) do
  362. if p:is_html() then
  363. local hc = p:get_html()
  364. if (not hc) then return false end
  365. local found = false
  366. hc:foreach_tag('a', function (tag, length)
  367. if (found) then return true end
  368. local u = tag:get_extra()
  369. if (u) then
  370. u = tostring(u):lower()
  371. if (u:match('^https?://%d+%.%d+%.%d+%.%d+')) then
  372. found = true
  373. end
  374. end
  375. return false
  376. end)
  377. if found then return true end
  378. return false
  379. end
  380. end
  381. end,
  382. description = 'Anchor points to an IP address',
  383. score = 1.0,
  384. group = 'html'
  385. }