You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

html.lua 6.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232
  1. -- Licensed to the Apache Software Foundation (ASF) under one or more
  2. -- contributor license agreements. See the NOTICE file distributed with
  3. -- this work for additional information regarding copyright ownership.
  4. -- The ASF licenses this file to you under the Apache License, Version 2.0
  5. -- (the "License"); you may not use this file except in compliance with
  6. -- the License. You may obtain a copy of the License at:
  7. --
  8. -- http://www.apache.org/licenses/LICENSE-2.0
  9. --
  10. -- Unless required by applicable law or agreed to in writing, software
  11. -- distributed under the License is distributed on an "AS IS" BASIS,
  12. -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. -- See the License for the specific language governing permissions and
  14. -- limitations under the License.
  -- Table of regexp-expression rules taken from the global configuration.
  local reconf = config.regexp
  local rspamd_regexp = require("rspamd_regexp")
  local rspamd_logger = require("rspamd_logger")

  -- Rule for messages that consist of an HTML part only
  reconf.MIME_HTML_ONLY = 'has_only_html_part()'
  -- Looks through the HTML text parts of `task` whose length (as reported by
  -- get_length) lies in the half-open range [min, max) for an image that is
  -- wrapped directly in an <a> tag.
  --
  -- Returns true as soon as such an image is found that is either not
  -- embedded or has height + width of at least 210; otherwise the function
  -- falls off the end and returns nothing (nil).
  local function check_html_image(task, min, max)
    -- True when the image's tag has an <a> parent and the image is either
    -- large enough or not embedded (small embedded images are skipped).
    local function is_linked_image(img)
      local img_tag = img['tag']
      if not img_tag then
        return false
      end

      local wrapper = img_tag:get_parent()
      if not wrapper or wrapper:get_type() ~= 'a' then
        return false
      end

      return img['height'] + img['width'] >= 210 or not img['embedded']
    end

    for _, part in ipairs(task:get_text_parts()) do
      if part:is_html() then
        local html = part:get_html()
        local part_len = part:get_length()

        if html and part_len >= min and part_len < max then
          -- A part without images simply yields an empty iteration.
          for _, img in ipairs(html:get_images() or {}) do
            if is_linked_image(img) then
              return true
            end
          end
        end
      end
    end
  end
  -- Linked image inside an HTML part whose length is in [0, 1024).
  rspamd_config.HTML_SHORT_LINK_IMG_1 = {
    description = 'Short html part (0..1K) with a link to an image',
    group = 'html',
    score = 3.0,
    callback = function(task)
      local min_len, max_len = 0, 1024
      return check_html_image(task, min_len, max_len)
    end,
  }
  -- Linked image inside an HTML part whose length is in [1024, 1536).
  rspamd_config.HTML_SHORT_LINK_IMG_2 = {
    description = 'Short html part (1K..1.5K) with a link to an image',
    group = 'html',
    score = 1.0,
    callback = function(task)
      local min_len, max_len = 1024, 1536
      return check_html_image(task, min_len, max_len)
    end,
  }
  -- Linked image inside an HTML part whose length is in [1536, 2048).
  rspamd_config.HTML_SHORT_LINK_IMG_3 = {
    description = 'Short html part (1.5K..2K) with a link to an image',
    group = 'html',
    score = 0.5,
    callback = function(task)
      local min_len, max_len = 1536, 2048
      return check_html_image(task, min_len, max_len)
    end,
  }
  -- R_EMPTY_IMAGE: triggers on an HTML part that is nearly empty (length
  -- below 50 as reported by get_length) yet carries a large image
  -- (height + width >= 400) whose parent tag is something other than <a>.
  rspamd_config.R_EMPTY_IMAGE = {
    description = 'Message contains empty parts and image',
    group = 'html',
    score = 2.0,
    callback = function(task)
      for _, part in ipairs(task:get_text_parts()) do
        if part:is_html() then
          local html = part:get_html()
          local part_len = part:get_length()

          if html and part_len < 50 then
            for _, img in ipairs(html:get_images() or {}) do
              -- Only large images are inspected further; each step below
              -- short-circuits to a falsy value when the previous one fails.
              local is_large = img['height'] + img['width'] >= 400
              local img_tag = is_large and img['tag']
              local wrapper = img_tag and img_tag:get_parent()

              if wrapper and wrapper:get_type() ~= 'a' then
                return true
              end
            end
          end
        end
      end
    end,
  }
  -- R_SUSPICIOUS_IMAGES: compares the amount of "text" carried by linked
  -- images with the real word count of each part that has an HTML context.
  --
  -- Every image whose parent tag is <a> and whose width + height lies
  -- strictly between 100 and 3000 is converted into an estimated word count
  -- (dim / 100). When those estimated words make up more than half of
  -- (real words + estimated words) for a part, the callback returns true
  -- together with a weight of (share - 0.5) * 2; otherwise it returns false.
  rspamd_config.R_SUSPICIOUS_IMAGES = {
    callback = function(task)
      for _, part in ipairs(task:get_text_parts()) do
        local html = part:get_html()
        if html then
          local words = part:get_words_count()
          local pic_words = 0

          for _, img in ipairs(html:get_images() or {}) do
            local tag = img['tag']
            local parent = tag and tag:get_parent()
            if parent and parent:get_type() == 'a' then
              local dim = img['width'] + img['height']
              -- do not count small and very large images
              if dim > 100 and dim < 3000 then
                -- We assume that a single picture 100x200 contains
                -- approx 3 words of text
                pic_words = pic_words + dim / 100
              end
            end
          end

          -- Guard against a part with neither words nor counted images.
          local total_words = words + pic_words
          if total_words > 0 then
            local rel = pic_words / total_words
            if rel > 0.5 then
              return true, (rel - 0.5) * 2
            end
          end
        end
      end

      return false
    end,
    score = 5.0,
    group = 'html',
    -- The rule measures images, not messages: description wording fixed.
    description = 'Message contains many suspicious images'
  }
  -- R_WHITE_ON_WHITE: detects low-contrast (effectively invisible) text.
  --
  -- For every <font> tag in an HTML part that carries both a `color` and a
  -- `bgcolor` in its extra data, the mean per-channel difference between the
  -- two colours is computed on a 0..1 scale. Tags with a difference below 0.1
  -- are treated as hidden: their content length is moved from the "normal"
  -- length to a weighted "transparent" length (the closer the colours, the
  -- larger the weight, up to 0.5).
  --
  -- Returns true, transparent_share * 2, and a description of the first
  -- offending tag when the transparent share exceeds 0.1; otherwise false.
  rspamd_config.R_WHITE_ON_WHITE = {
    callback = function(task)
      local found = false
      local normal_len = 0
      local transp_len = 0
      local arg

      for _, part in ipairs(task:get_text_parts()) do
        -- Fetch the HTML context once; non-HTML parts are skipped.
        local html = part:is_html() and part:get_html()
        if html then
          normal_len = part:get_length()

          html:foreach_tag('font', function(tag)
            local extra = tag:get_extra()
            if extra and extra['bgcolor'] and extra['color'] then
              local color = extra['color']
              local bgcolor = extra['bgcolor']
              -- Should use visual approach here some day
              local diff_r = math.abs(color[1] - bgcolor[1]) / 255.0
              local diff_g = math.abs(color[2] - bgcolor[2]) / 255.0
              local diff_b = math.abs(color[3] - bgcolor[3]) / 255.0
              local diff = (diff_r + diff_g + diff_b) / 3.0

              if diff < 0.1 then
                local content_len = tag:get_content_length()
                found = true
                -- Weight only this tag's content. Multiplying the running
                -- total instead would shrink the contribution of every
                -- previously seen hidden tag.
                transp_len = transp_len + content_len * (0.1 - diff) * 5.0
                normal_len = normal_len - content_len

                if not arg then
                  -- Zero-padded components keep the colour unambiguous
                  -- (e.g. 255,0,10 prints as #ff000a rather than #ff0a).
                  arg = string.format(
                    '%s color #%02x%02x%02x bgcolor #%02x%02x%02x',
                    tostring(tag:get_type()),
                    color[1], color[2], color[3],
                    bgcolor[1], bgcolor[2], bgcolor[3])
                end
              end
            end

            return false -- Continue search
          end)
        end
      end

      if found then
        if normal_len < 0 then normal_len = 0 end

        -- Avoid 0/0 when the hidden tags had no content at all.
        local total_len = normal_len + transp_len
        if total_len > 0 then
          local transp_rate = transp_len / total_len
          if transp_rate > 0.1 then
            return true, (transp_rate * 2.0), arg
          end
        end
      end

      return false
    end,
    score = 6.0,
    group = 'html',
    description = 'Message contains low contrast text'
  }