You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

html.lua 9.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339
  1. -- Licensed to the Apache Software Foundation (ASF) under one or more
  2. -- contributor license agreements. See the NOTICE file distributed with
  3. -- this work for additional information regarding copyright ownership.
  4. -- The ASF licenses this file to you under the Apache License, Version 2.0
  5. -- (the "License"); you may not use this file except in compliance with
  6. -- the License. You may obtain a copy of the License at:
  7. --
  8. -- http://www.apache.org/licenses/LICENSE-2.0
  9. --
  10. -- Unless required by applicable law or agreed to in writing, software
  11. -- distributed under the License is distributed on an "AS IS" BASIS,
  12. -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. -- See the License for the specific language governing permissions and
  14. -- limitations under the License.
  local reconf = config.regexp

  -- Regexp rule: fires for mail whose only part is HTML
  reconf.MIME_HTML_ONLY = {
    description = 'Messages that have only HTML part',
    group = 'headers',
    score = 0.2,
    re = 'has_only_html_part()',
  }
  --- Look for an image wrapped directly in an anchor inside an HTML part
  -- whose length falls in the half-open range [min, max).
  -- @param task object exposing get_text_parts()
  -- @param min inclusive lower bound on the part length
  -- @param max exclusive upper bound on the part length
  -- @return true on the first qualifying image, nothing (nil) otherwise
  local function check_html_image(task, min, max)
    -- An image qualifies when its tag's parent is an <a> element and it is
    -- either large enough (height + width >= 210) or not embedded;
    -- small and unknown-size embedded images are ignored
    local function is_linked_image(img)
      local tag = img.tag
      local parent = tag and tag:get_parent()
      if not parent or parent:get_type() ~= 'a' then
        return false
      end
      return img.height + img.width >= 210 or not img.embedded
    end

    for _, part in ipairs(task:get_text_parts()) do
      if part:is_html() then
        local html = part:get_html()
        local size = part:get_length()
        local images = html and size >= min and size < max and html:get_images()
        if images then
          for _, img in ipairs(images) do
            if is_linked_image(img) then
              return true
            end
          end
        end
      end
    end
  end
  -- Linked image inside an HTML part shorter than 1024 bytes
  rspamd_config.HTML_SHORT_LINK_IMG_1 = {
    description = 'Short html part (0..1K) with a link to an image',
    group = 'html',
    score = 2.0,
    callback = function(task)
      local lower, upper = 0, 1024
      return check_html_image(task, lower, upper)
    end,
  }
  -- Linked image inside an HTML part of 1024 up to (not including) 1536 bytes
  rspamd_config.HTML_SHORT_LINK_IMG_2 = {
    description = 'Short html part (1K..1.5K) with a link to an image',
    group = 'html',
    score = 1.0,
    callback = function(task)
      local lower, upper = 1024, 1536
      return check_html_image(task, lower, upper)
    end,
  }
  -- Linked image inside an HTML part of 1536 up to (not including) 2048 bytes
  rspamd_config.HTML_SHORT_LINK_IMG_3 = {
    description = 'Short html part (1.5K..2K) with a link to an image',
    group = 'html',
    score = 0.5,
    callback = function(task)
      local lower, upper = 1536, 2048
      return check_html_image(task, lower, upper)
    end,
  }
  -- Rule: an HTML part with almost no text (length < 50) that carries a
  -- large image (height + width >= 400) whose parent tag is not an anchor
  rspamd_config.R_EMPTY_IMAGE = {
    description = 'Message contains empty parts and image',
    group = 'html',
    score = 2.0,
    -- Returns true on the first matching image; falls through (nil) otherwise
    callback = function(task)
      for _, part in ipairs(task:get_text_parts()) do
        if part:is_html() then
          local html = part:get_html()
          local text_len = part:get_length()
          -- Only nearly empty parts with an HTML context are examined
          local images = html and text_len < 50 and html:get_images()
          if images then
            for _, img in ipairs(images) do
              if img.height + img.width >= 400 then
                local tag = img.tag
                local parent = tag and tag:get_parent()
                if parent and parent:get_type() ~= 'a' then
                  return true
                end
              end
            end
          end
        end
      end
    end,
  }
  -- Rule: the share of "picture words" (text assumed to be carried by linked
  -- images) in an HTML part exceeds half of all words in that part.
  rspamd_config.R_SUSPICIOUS_IMAGES = {
    -- Returns `true, weight` for the first part whose picture-word ratio is
    -- above 0.5 (weight = (ratio - 0.5) * 2), otherwise `false`.
    callback = function(task)
      local tp = task:get_text_parts() -- get text parts in a message

      for _, p in ipairs(tp) do
        local h = p:get_html()

        if h then
          local l = p:get_words_count()
          local img = h:get_images()
          local pic_words = 0

          if img then
            for _, i in ipairs(img) do
              local dim = i['width'] + i['height']
              local tag = i['tag']
              local parent = tag and tag:get_parent()

              -- Only images placed directly inside an anchor are counted
              if parent and parent:get_type() == 'a' then
                -- do not trigger on small and large images
                if dim > 100 and dim < 3000 then
                  -- We assume that a single picture 100x200 contains approx 3 words of text
                  pic_words = pic_words + dim / 100
                end
              end
            end
          end

          -- Guard against division by zero for parts with no words at all
          if l + pic_words > 0 then
            local rel = pic_words / (l + pic_words)

            if rel > 0.5 then
              return true, (rel - 0.5) * 2
            end
          end
        end
      end

      return false
    end,
    score = 5.0,
    group = 'html',
    -- Fixed wording: the rule measures images, not messages
    description = 'Message contains many suspicious images'
  }
  -- Rule: text whose colour is almost the same as its background colour.
  rspamd_config.R_WHITE_ON_WHITE = {
    -- Returns `true, weight, option` when the estimated share of low-contrast
    -- text exceeds 0.1 (weight is capped at 1.0), otherwise `false`.
    -- `option` names the offending tag together with both colours.
    callback = function(task)
      local tp = task:get_text_parts() -- get text parts in a message
      local ret = false
      local transp_rate = 0
      local arg

      for _, p in ipairs(tp) do
        if p:is_html() and p:get_html() then
          local normal_len = p:get_length()
          local transp_len = 0
          local hc = p:get_html() -- we get HTML context

          hc:foreach_tag({'font', 'span', 'div', 'p'}, function(tag)
            local bl = tag:get_extra()

            if bl and bl['bgcolor'] and bl['color'] and bl['visible'] then
              local color = bl['color']
              local bgcolor = bl['bgcolor']
              -- Should use visual approach here some day
              local diff_r = math.abs(color[1] - bgcolor[1]) / 255.0
              local diff_g = math.abs(color[2] - bgcolor[2]) / 255.0
              local diff_b = math.abs(color[3] - bgcolor[3]) / 255.0
              -- Mean per-channel distance, normalised to 0..1
              local diff = (diff_r + diff_g + diff_b) / 3.0

              if diff < 0.1 then
                ret = true
                local content_len = tag:get_content_length()
                -- NOTE(review): transp_len is recomputed for each tag rather
                -- than accumulated across tags; confirm this is intended
                transp_len = content_len * (0.1 - diff) * 5.0
                normal_len = normal_len - content_len
                local tr = transp_len / (normal_len + transp_len)

                if tr > transp_rate then
                  transp_rate = tr
                  -- Two hex digits per channel so that leading zeros are
                  -- kept and the printed colour is unambiguous
                  arg = string.format(
                    '%s color #%02x%02x%02x bgcolor #%02x%02x%02x',
                    tostring(tag:get_type()),
                    color[1], color[2], color[3],
                    bgcolor[1], bgcolor[2], bgcolor[3])
                end
              end
            end

            return false -- Continue search
          end)
        end
      end

      if ret then
        if transp_rate > 0.1 then
          -- `x ~= x` is true only for NaN; clamp it together with big rates
          if transp_rate > 0.5 or transp_rate ~= transp_rate then
            transp_rate = 0.5
          end

          return true, (transp_rate * 2.0), arg
        end
      end

      return false
    end,
    score = 4.0,
    group = 'html',
    one_shot = true,
    description = 'Message contains low contrast text'
  }
  -- Rule: some <link> tag in an HTML part points at a .css resource
  rspamd_config.EXT_CSS = {
    description = 'Message contains external CSS reference',
    group = 'html',
    score = 1.0,
    callback = function(task)
      local rspamd_regexp = require "rspamd_regexp"
      local css_re = rspamd_regexp.create_cached('/^.*\\.css(?:[?#].*)?$/i')
      local parts = task:get_text_parts()
      local found = false

      -- Tag visitor: returns the current match state, i.e. false until a
      -- stylesheet reference has been seen and true from then on
      local function visit_link(tag)
        local extra = tag:get_extra()
        if extra then
          local target = tostring(extra)
          if target and css_re:match(target) then
            found = true
          end
        end
        return found
      end

      for _, part in ipairs(parts) do
        if part:is_html() and part:get_html() then
          local html = part:get_html()
          html:foreach_tag({'link'}, visit_link)
        end
      end

      return found
    end,
  }
  -- Rule: the visible text of an anchor starts with one of http:/https:
  -- while its target starts with the other one.
  rspamd_config.HTTP_TO_HTTPS = {
    -- Returns true as soon as one mismatching anchor is found in any HTML
    -- part, false otherwise. Every HTML part is inspected: a part without
    -- an HTML context or without a match no longer ends the whole check.
    callback = function(task)
      local tp = task:get_text_parts()
      if (not tp) then return false end

      local found = false

      -- Tag visitor; returns true once a match has been recorded
      local function check_anchor(tag)
        -- Skip this loop if we already have a match
        if (found) then return true end

        local c = tag:get_content()
        if (not c) then return false end
        c = tostring(c):lower()
        if (not c:match('^http')) then return false end

        local u = tag:get_extra()
        if (not u) then return false end
        u = tostring(u):lower()
        if (not u:match('^http')) then return false end

        if ((c:match('^http:') and u:match('^https:')) or
            (c:match('^https:') and u:match('^http:')))
        then
          found = true
          return true
        end

        return false
      end

      for _, p in ipairs(tp) do
        if p:is_html() then
          local hc = p:get_html()

          if hc then
            hc:foreach_tag('a', check_anchor)
            if (found) then return true end
          end
        end
      end

      return false
    end,
    description = 'Anchor text contains different scheme to target URL',
    score = 2.0,
    group = 'html'
  }
  -- Rule: an anchor whose target starts with http(s):// followed by a
  -- dotted quad of decimal numbers.
  rspamd_config.HTTP_TO_IP = {
    -- Returns true as soon as one such anchor is found in any HTML part and
    -- an explicit false otherwise (including messages without HTML parts).
    -- Every HTML part is inspected: a part without an HTML context or
    -- without a match no longer ends the whole check.
    callback = function(task)
      local tp = task:get_text_parts()
      if (not tp) then return false end

      local found = false

      -- Tag visitor; returns true once a match has been recorded
      local function check_anchor(tag)
        if (found) then return true end

        local u = tag:get_extra()
        if (u) then
          u = tostring(u):lower()
          if (u:match('^https?://%d+%.%d+%.%d+%.%d+')) then
            found = true
            return true
          end
        end

        return false
      end

      for _, p in ipairs(tp) do
        if p:is_html() then
          local hc = p:get_html()

          if hc then
            hc:foreach_tag('a', check_anchor)
            if found then return true end
          end
        end
      end

      return false
    end,
    description = 'Anchor points to an IP address',
    score = 1.0,
    group = 'html'
  }