diff options
Diffstat (limited to 'lualib/lua_meta.lua')
-rw-r--r-- | lualib/lua_meta.lua | 396 |
1 files changed, 396 insertions, 0 deletions
diff --git a/lualib/lua_meta.lua b/lualib/lua_meta.lua new file mode 100644 index 000000000..96404192d --- /dev/null +++ b/lualib/lua_meta.lua @@ -0,0 +1,396 @@ +--[[ +Copyright (c) 2017, Vsevolod Stakhov <vsevolod@highsecure.ru> + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +]]-- + +local exports = {} + +local N = "metatokens" + +-- Metafunctions +local function meta_size_function(task) + local sizes = { + 100, + 200, + 500, + 1000, + 2000, + 4000, + 10000, + 20000, + 30000, + 100000, + 200000, + 400000, + 800000, + 1000000, + 2000000, + 8000000, + } + + local size = task:get_size() + for i = 1,#sizes do + if sizes[i] >= size then + return {(1.0 * i) / #sizes} + end + end + + return {0} +end + +local function meta_images_function(task) + local images = task:get_images() + local ntotal = 0 + local njpg = 0 + local npng = 0 + local nlarge = 0 + local nsmall = 0 + + if images then + for _,img in ipairs(images) do + if img:get_type() == 'png' then + npng = npng + 1 + elseif img:get_type() == 'jpeg' then + njpg = njpg + 1 + end + + local w = img:get_width() + local h = img:get_height() + + if w > 0 and h > 0 then + if w + h > 256 then + nlarge = nlarge + 1 + else + nsmall = nsmall + 1 + end + end + + ntotal = ntotal + 1 + end + end + if ntotal > 0 then + njpg = 1.0 * njpg / ntotal + npng = 1.0 * npng / ntotal + nlarge = 1.0 * nlarge / ntotal + nsmall = 1.0 * nsmall / ntotal + end + return {ntotal,njpg,npng,nlarge,nsmall} +end + +local function meta_nparts_function(task) + local nattachments = 0 + local ntextparts = 0 + local totalparts = 1 + + local tp = task:get_text_parts() + if tp then + ntextparts = #tp + end + + local parts = task:get_parts() + + if parts then + for _,p in ipairs(parts) do + if p:get_filename() then + nattachments = nattachments + 1 + end + totalparts = totalparts + 1 + end + end + + return {(1.0 * ntextparts)/totalparts, (1.0 * nattachments)/totalparts} +end + +local function meta_encoding_function(task) + local nutf = 0 + local nother = 0 + + local tp = task:get_text_parts() + if tp and #tp > 0 then + for _,p in ipairs(tp) do + if p:is_utf() then + nutf = nutf + 1 + else + nother = nother + 1 + end + end + + return {nutf / #tp, nother / #tp} + end + + return {0, 0} +end + +local function meta_recipients_function(task) + local nmime = 0 + local nsmtp = 0 + + if task:has_recipients('mime') then + nmime = #(task:get_recipients('mime')) + end + if task:has_recipients('smtp') then + nsmtp = #(task:get_recipients('smtp')) + end + + if nmime > 0 then nmime = 1.0 / nmime end + if nsmtp > 0 then nsmtp = 1.0 / nsmtp end + + return {nmime,nsmtp} +end + +local function meta_received_function(task) + local count_factor = 0 + local invalid_factor = 0 + local rh = task:get_received_headers() + local time_factor = 0 + local secure_factor = 0 + local fun = require "fun" + + if rh and #rh > 0 then + + local ntotal = 0.0 + local init_time = 0 + + fun.each(function(rc) + ntotal = ntotal + 1.0 + + if not rc.by_hostname then + invalid_factor = invalid_factor + 1.0 + end + if init_time == 0 and rc.timestamp then + init_time = rc.timestamp + elseif rc.timestamp then + time_factor = time_factor + math.abs(init_time - rc.timestamp) + init_time = rc.timestamp + end + if rc.flags and (rc.flags['ssl'] or rc.flags['authenticated']) then + secure_factor = secure_factor + 1.0 + end + end, + fun.filter(function(rc) return not rc.flags or not rc.flags['artificial'] end, rh)) + + invalid_factor = invalid_factor / ntotal + secure_factor = secure_factor / ntotal + count_factor = 1.0 / ntotal + + if time_factor ~= 0 then + time_factor = 1.0 / time_factor + end + end + + return {count_factor, invalid_factor, time_factor, secure_factor} +end + +local function meta_urls_function(task) + if task:has_urls() then + return {1.0 / #(task:get_urls())} + end + + return {0} +end + +local function meta_words_function(task) + local avg_len = task:get_mempool():get_variable("avg_words_len", "double") or 0.0 + local short_words = task:get_mempool():get_variable("short_words_cnt", "double") or 0.0 + local ret_len = 0 + + local lens = { + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 15, + 20, + } + + for i = 1,#lens do + if lens[i] >= avg_len then + ret_len = (1.0 * i) / #lens + break + end + end + + local tp = task:get_text_parts() + local wres = { + 0, -- spaces rate + 0, -- double spaces rate + 0, -- non spaces rate + 0, -- ascii characters rate + 0, -- non-ascii characters rate + 0, -- capital characters rate + 0, -- numeric cahracters + } + for _,p in ipairs(tp) do + local stats = p:get_stats() + local len = p:get_length() + + if len > 0 then + wres[1] = wres[1] + stats['spaces'] / len + wres[2] = wres[2] + stats['double_spaces'] / len + wres[3] = wres[3] + stats['non_spaces'] / len + wres[4] = wres[4] + stats['ascii_characters'] / len + wres[5] = wres[5] + stats['non_ascii_characters'] / len + wres[6] = wres[6] + stats['capital_letters'] / len + wres[7] = wres[7] + stats['numeric_characters'] / len + end + end + + local ret = { + short_words, + ret_len, + } + + local divisor = 1.0 + if #tp > 0 then + divisor = #tp + end + + for _,wr in ipairs(wres) do + table.insert(ret, wr / divisor) + end + + return ret +end + +local metafunctions = { + { + cb = meta_size_function, + ninputs = 1, + desc = { + "size" + } + }, + { + cb = meta_images_function, + ninputs = 5, + -- 1 - number of images, + -- 2 - number of png images, + -- 3 - number of jpeg images + -- 4 - number of large images (> 128 x 128) + -- 5 - number of small images (< 128 x 128) + desc = { + 'nimages', + 'npng_images', + 'njpeg_images', + 'nlarge_images', + 'nsmall_images' + } + }, + { + cb = meta_nparts_function, + ninputs = 2, + -- 1 - number of text parts + -- 2 - number of attachments + desc = { + 'ntext_parts', + 'nattachments' + } + }, + { + cb = meta_encoding_function, + ninputs = 2, + -- 1 - number of utf parts + -- 2 - number of non-utf parts + desc = { + 'nutf_parts', + 'nascii_parts' + } + }, + { + cb = meta_recipients_function, + ninputs = 2, + -- 1 - number of mime rcpt + -- 2 - number of smtp rcpt + desc = { + 'nmime_rcpt', + 'nsmtp_rcpt' + } + }, + { + cb = meta_received_function, + ninputs = 4, + desc = { + 'nreceived', + 'nreceived_invalid', + 'nreceived_bad_time', + 'nreceived_secure' + } + }, + { + cb = meta_urls_function, + ninputs = 1, + desc = { + 'nurls' + } + }, + { + cb = meta_words_function, + ninputs = 9, + desc = { + 'avg_words_len', + 'nshort_words', + 'spaces_rate', + 'double_spaces_rate', + 'non_spaces_rate', + 'ascii_characters_rate', + 'non_ascii_characters_rate', + 'capital_characters_rate', + 'numeric_cahracters' + } + }, +} + +local function rspamd_gen_metatokens(task) + local rspamd_logger = require "rspamd_logger" + local ipairs = ipairs + local metatokens = {} + local cached = task:cache_get('metatokens') + + if cached then + return cached + else + for _,mt in ipairs(metafunctions) do + local ct = mt.cb(task) + for i,tok in ipairs(ct) do + rspamd_logger.debugm(N, task, "metatoken: %s = %s", mt.desc[i], tok) + table.insert(metatokens, tok) + end + end + + task:cache_set('metatokens', metatokens) + end + + return metatokens +end + +exports.rspamd_gen_metatokens = rspamd_gen_metatokens + +local function rspamd_count_metatokens() + local ipairs = ipairs + local total = 0 + for _,mt in ipairs(metafunctions) do + total = total + mt.ninputs + end + + return total +end + +exports.rspamd_count_metatokens = rspamd_count_metatokens + +return exports |