aboutsummaryrefslogtreecommitdiffstats
path: root/lualib/lua_fuzzy.lua
blob: 2ce18bb62addc9608d7b767fe899376bf714405a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
--[[
Copyright (c) 2018, Vsevolod Stakhov <vsevolod@highsecure.ru>

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
]]--

--[[[
-- @module lua_fuzzy
-- This module contains helper functions that support the fuzzy check module
--]]


local N = "lua_fuzzy"
local lua_util = require "lua_util"
local rspamd_logger = require "rspamd_logger"
local ts = require("tableshape").types

-- Storage for processed rules; a rule is addressed by its numeric index here
local rules = {}

-- Option set used by the built-in `recommended` policy
local recommended_policy = {
  -- length limits
  min_bytes = 1024,
  text_multiplier = 4.0, -- divide min_bytes by 4 for texts
  min_length = 32,
  -- image dimension limits
  min_width = 500,
  min_height = 500,
  -- content selection
  mime_types = { "application/*" },
  scan_archives = true,
  skip_images = false,
  -- text hashing modes
  text_shingles = true,
  short_text_direct_hash = true,
}

-- Named policies known to this module
local policies = {
  recommended = recommended_policy,
}

local default_policy = policies['recommended']

-- Schema used to validate (and normalise) user supplied policies.
-- Numeric limits may also be given as strings, which are converted with
-- `tonumber`. Field names must match the option names used in `policies`
-- and read by the check_* functions.
local policy_schema = ts.shape{
  min_bytes = ts.number + ts.string / tonumber,
  min_height = ts.number + ts.string / tonumber,
  min_width = ts.number + ts.string / tonumber,
  min_length = ts.number + ts.string / tonumber,
  text_multiplier = ts.number,
  mime_types = ts.array_of(ts.string),
  -- `boolean` is the type name documented by tableshape
  scan_archives = ts.boolean,
  short_text_direct_hash = ts.boolean,
  text_shingles = ts.boolean,
  -- was misspelled as `skip_imagess`, which did not match the real option
  skip_images = ts.boolean,
}


local exports = {}


--[[[
-- @function lua_fuzzy.register_policy(name, policy)
-- Adds a new policy with name `name`. Must be valid, checked using policy_schema
-- An existing policy with the same name is replaced (a warning is logged).
-- An invalid policy is not registered; an error is logged instead.
-- @param {string} name name to register the policy under
-- @param {table} policy policy options to validate and store
--]]
exports.register_policy = function(name, policy)
  if policies[name] then
    rspamd_logger.warnx(rspamd_config, "overriding policy %s", name)
  end

  local parsed_policy, err = policy_schema:transform(policy)

  if not parsed_policy then
    rspamd_logger.errx(rspamd_config, 'invalid fuzzy rule policy %s: %s',
        name, err)

    return
  end

  -- Index by the *value* of `name`: `policies.name` would always write to the
  -- literal key "name", so the policy could never be found by process_rule
  policies[name] = parsed_policy
end

--[[[
-- @function lua_fuzzy.process_rule(rule)
-- Processes fuzzy rule (applying policies or defaults if needed). Returns policy id
-- The rule itself is not modified: a shallow copy is processed and stored.
-- When `rule.policy` is set, the policy with that name is used; otherwise the
-- default policy is used. An unknown policy name is logged and the rule is
-- stored without any policy applied.
-- @param {table} rule rule options
-- @return {number} index of the stored rule, as expected by check_mime_part
--]]
exports.process_rule = function(rule)
  local processed_rule = lua_util.shallowcopy(rule)
  local policy = default_policy

  if processed_rule.policy then
    policy = policies[processed_rule.policy]
  end

  if policy then
    -- Applied for the default policy as well; previously the default was
    -- selected but never merged when the rule did not name a policy
    processed_rule = lua_util.override_defaults(policy, processed_rule)
  else
    rspamd_logger.warnx(rspamd_config, "unknown policy %s", processed_rule.policy)
  end

  table.insert(rules, processed_rule)
  return #rules
end

-- Checks the length of `part` against `rule.min_bytes`.
-- For text parts the length is multiplied by `rule.text_multiplier` (when set)
-- before the comparison, which effectively lowers the limit for texts.
-- Returns true when `rule.min_bytes` is not set or when the (adjusted) length
-- is not below it, and false otherwise.
local function check_length(task, part, rule)
  local min_bytes = rule.min_bytes

  if not min_bytes then
    -- No limit configured
    return true
  end

  local bytes = part:get_length()
  local adjusted_bytes = bytes

  if part:is_text() and rule.text_multiplier then
    adjusted_bytes = bytes * rule.text_multiplier
  end

  if min_bytes > adjusted_bytes then
    -- The two literals are concatenated, so the separating space is required
    lua_util.debugm(N, task, 'skip part of length %s (%s adjusted) ' ..
        'as it has less than %s bytes',
        bytes, adjusted_bytes, min_bytes)

    return false
  end

  return true
end

-- Decides how a text part may be hashed under `rule`.
-- Returns two booleans: whether a direct hash is allowed and whether shingles
-- are allowed.
local function check_text_part(task, part, rule, text)
  if not rule.text_shingles then
    -- Shingles are disabled: only the length check decides about direct hash
    return check_length(task, part, rule), false
  end

  -- Shingles require enough words when min_length is configured
  local words = text:get_words_count()
  local enough_words = not (rule.min_length and words < rule.min_length)

  if not enough_words then
    lua_util.debugm(N, task, 'text has less than %s words: %s',
        rule.min_length, words)
  end

  if enough_words or rule.short_text_direct_hash then
    return check_length(task, part, rule), enough_words
  end

  -- Short text and direct hashing of short texts is not enabled
  return false, false
end

-- Decides whether an image part may be hashed under `rule`.
-- Returns two booleans: whether a direct hash is allowed and whether shingles
-- are allowed (always false for images).
local function check_image_part(task, part, rule, image)
  if rule.skip_images then
    lua_util.debugm(N, task, 'skip image part as images are disabled')
    return false,false
  end

  -- When only one dimension limit is set, it is used for both dimensions
  local want_width = rule.min_width or rule.min_height
  local want_height = rule.min_height or rule.min_width

  if want_width then
    local img_height = image:get_height()
    local img_width = image:get_width()

    -- Dimensions are compared only when both of them are known
    local too_small = img_height and img_width and
        (img_height < want_height or img_width < want_width)

    if too_small then
      lua_util.debugm(N, task, 'skip image part as it does not meet minimum sizes: %sx%s < %sx%s',
        img_width, img_height, want_width, want_height)

      return false, false
    end
  end

  local direct_ok = check_length(task, part, rule)

  return direct_ok, false
end

-- Placeholder for checks of parts that are not text, image or archive:
-- currently every such part is accepted for both hashing modes.
local function mime_types_check(task, part, rule)
  -- TODO: add checks
  local allow_direct, allow_shingles = true, true

  return allow_direct, allow_shingles
end

--[[[
-- @function lua_fuzzy.check_mime_part(task, part, rule_id)
-- Checks whether `part` should be processed by the rule stored under `rule_id`.
-- Returns two booleans: the first one allows a direct hash, the second one
-- allows shingles. Both are false when the rule id is unknown.
--]]
exports.check_mime_part = function(task, part, rule_id)
  local rule = rules[rule_id]

  if not rule then
    rspamd_logger.errx(task, 'cannot find rule with id %s', rule_id)

    return false,false
  end

  -- Dispatch on the part kind; the order of the probes is significant
  if part:is_text() then
    local text = part:get_text()

    return check_text_part(task, part, rule, text)
  elseif part:is_image() then
    local image = part:get_image()

    return check_image_part(task, part, rule, image)
  elseif part:is_archive() and rule.scan_archives then
    -- Always send archives
    return true,false
  end

  -- Everything else is handled by the generic mime types check
  return mime_types_check(task, part, rule)
end

return exports