You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

statistics_dump.lua 14KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540
  1. --[[
  2. Copyright (c) 2021, Vsevolod Stakhov <vsevolod@highsecure.ru>
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ]]--
  13. local lua_redis = require "lua_redis"
  14. local rspamd_logger = require "rspamd_logger"
  15. local argparse = require "argparse"
  16. local rspamd_zstd = require "rspamd_zstd"
  17. local rspamd_text = require "rspamd_text"
  18. local rspamd_util = require "rspamd_util"
  19. local rspamd_cdb = require "rspamd_cdb"
  20. local lua_util = require "lua_util"
  21. local rspamd_i64 = require "rspamd_int64"
  22. local ucl = require "ucl"
  -- Module name; also used as the key of this tool's own section in the
  -- configuration when looking for Redis settings (`cfg[N]`).
  local N = "statistics_dump"
  -- Shared empty table used as a fallback when `cfg[N]` is absent.
  local E = {}
  -- Redis-backed classifiers found in the configuration; filled by
  -- check_redis_classifier() and consumed by the dump/restore handlers.
  local classifiers = {}

  -- Define command line options
  local parser = argparse()
      :name "rspamadm statistics_dump"
      :description "Dump/restore Rspamd statistics"
      :help_description_margin(30)
      :command_target("command")
      :require_command(false)

  parser:option "-c --config"
        :description "Path to config file"
        :argname("<cfg>")
        :default(rspamd_paths["CONFDIR"] .. "/" .. "rspamd.conf")

  -- Extract subcommand
  local dump = parser:command "dump d"
                     :description "Dump bayes statistics"
  -- JSON and CDB outputs are mutually exclusive
  dump:mutex(
      dump:flag "-j --json"
          :description "Json output",
      dump:flag "-C --cdb"
          :description "CDB output"
  )
  dump:flag "-c --compress"
      :description "Compress output"
  dump:option "-b --batch-size"
      :description "Number of entries to process at once"
      :argname("<elts>")
      :convert(tonumber)
      :default(1000)

  -- Restore
  local restore = parser:command "restore r"
                        :description "Restore bayes statistics"
  restore:argument "file"
         :description "Input file to process"
         :argname "<file>"
         :args "*"
  restore:option "-b --batch-size"
         :description "Number of entries to process at once"
         :argname("<elts>")
         :convert(tonumber)
         :default(1000)
  -- The description used to be a copy of the batch-size one
  restore:option "-m --mode"
         :description "Restore mode: append, subtract or replace existing values"
         :argname("<append|subtract|replace>")
         :convert {
           ['append'] = 'append',
           ['subtract'] = 'subtract',
           ['replace'] = 'replace',
         }
         :default 'append'
  restore:flag "-n --no-operation"
         :description "Only show redis commands to be issued"
  --- Loads the configuration file named by `opts.config` into `rspamd_config`.
  -- Runs two stages: `load_ucl` on the file, then `parse_rcl`. If either stage
  -- reports failure, the error is logged and the process exits with status 1.
  -- @param opts parsed command line options (only `opts.config` is read)
  local function load_config(opts)
    local cfg_path = opts['config']

    local ok, err = rspamd_config:load_ucl(cfg_path)
    if not ok then
      rspamd_logger.errx('cannot parse %s: %s', cfg_path, err)
      os.exit(1)
    end

    ok, err = rspamd_config:parse_rcl({'logging', 'worker'})
    if not ok then
      rspamd_logger.errx('cannot process %s: %s', cfg_path, err)
      os.exit(1)
    end
  end
  --- Registers a Redis-backed classifier in the module-level `classifiers` list.
  -- Classifiers without `new_schema` are skipped. For the rest, the spam and ham
  -- symbols are taken from the statfile definitions and the Redis parameters are
  -- resolved from, in order: the classifier itself, this tool's own config
  -- section (`cfg[N]`, 'bayes' scope), and the same section with the last
  -- argument of `try_load_redis_servers` set to true.
  -- @param cls classifier configuration table
  -- @param cfg whole configuration object (as returned by `get_ucl`)
  -- @return false if the classifier cannot be used; nothing otherwise
  local function check_redis_classifier(cls, cfg)
    -- Skip old classifiers
    if not cls.new_schema then
      return
    end

    local symbol_spam, symbol_ham

    -- Load symbols from statfiles
    local function check_statfile_table(tbl, def_sym)
      local symbol = tbl.symbol or def_sym

      local spam
      if tbl.spam ~= nil then
        -- An explicit `spam = false` must win over the name heuristic below
        spam = tbl.spam
      else
        -- No explicit flag: guess the class from the symbol name
        spam = string.match(symbol:upper(), 'SPAM') ~= nil
      end

      if spam then
        symbol_spam = symbol
      else
        symbol_ham = symbol
      end
    end

    local statfiles = cls.statfile
    if type(statfiles) ~= 'table' then
      -- Nothing to dump or restore without statfile definitions
      rspamd_logger.errx('classifier has no statfile definitions, skip it')
      return false
    end

    if statfiles[1] then
      -- Array form: each element is either a statfile or a map name -> statfile
      for _,stf in ipairs(statfiles) do
        if not stf.symbol then
          for k,v in pairs(stf) do
            check_statfile_table(v, k)
          end
        else
          check_statfile_table(stf, 'undefined')
        end
      end
    else
      -- Map form: name -> statfile
      for stn,stf in pairs(statfiles) do
        check_statfile_table(stf, stn)
      end
    end

    local redis_params = lua_redis.try_load_redis_servers(cls,
        rspamd_config, false, 'bayes')
    if not redis_params then
      redis_params = lua_redis.try_load_redis_servers(cfg[N] or E,
          rspamd_config, false, 'bayes')
    end
    if not redis_params then
      redis_params = lua_redis.try_load_redis_servers(cfg[N] or E,
          rspamd_config, true)
    end
    if not redis_params then
      return false
    end

    table.insert(classifiers, {
      symbol_spam = symbol_spam,
      symbol_ham = symbol_ham,
      redis_params = redis_params,
    })
  end
  --- Turns a flat list `{k1, v1, k2, v2, ...}` into a map `{k1 = v1, k2 = v2}`.
  -- Used for Redis replies such as HGETALL that interleave fields and values.
  -- A trailing key with no value is dropped (assigning nil stores nothing).
  -- @param ar flat array of alternating keys and values
  -- @return new table mapping each key to the value that follows it
  local function redis_map_zip(ar)
    local map = {}
    local idx, total = 1, #ar

    while idx <= total do
      map[ar[idx]] = ar[idx + 1]
      idx = idx + 2
    end

    return map
  end
  -- Empties a table in place: uses `table.clear` when the runtime provides it,
  -- otherwise removes every key reported by `lua_util.keys`.
  local clear_fcn = table.clear
  if not clear_fcn then
    clear_fcn = function(tbl)
      local names = lua_util.keys(tbl)
      for i = 1, #names do
        tbl[names[i]] = nil
      end
    end
  end
  -- zstd streaming context; created on the first dump_out() call that has
  -- `opts.compress` set and then reused by every later call.
  local compress_ctx

  --- Writes the accumulated output chunks.
  -- Without compression every array element of `out` goes to `io.write`. With
  -- compression the chunks are fed to the zstd stream, using the 'end'
  -- operation for the final call and 'flush' otherwise.
  -- @param out array of output strings
  -- @param opts parsed command line options (`opts.compress` is read)
  -- @param last true when this is the final chunk
  local function dump_out(out, opts, last)
    if opts.compress and not compress_ctx then
      compress_ctx = rspamd_zstd.compress_ctx()
    end

    if not compress_ctx then
      for _,chunk in ipairs(out) do
        io.write(chunk)
      end
      return
    end

    local stream_op = last and 'end' or 'flush'
    compress_ctx:stream(rspamd_text.fromtable(out), stream_op):write()
  end
  --- Appends the pending elements of `out[pattern]` to a CDB file.
  -- The builder is created on first use for the file `<pattern>.cdb`, seeded
  -- with the `_lrnspam` / `_lrnham_` counters, and kept in `out.cdb_builder`
  -- between calls. It is finalized and dropped when `last` is set.
  -- @param out output state; `out[pattern]` holds `learns_spam`, `learns_ham`, `elts`
  -- @param opts parsed command line options (not used here)
  -- @param last true to finalize the CDB after adding the elements
  -- @param pattern key of the results in `out`, also the CDB file base name
  local function dump_cdb(out, opts, last, pattern)
    local results = out[pattern]
    local builder = out.cdb_builder

    if not builder then
      -- First invocation
      builder = rspamd_cdb.build(string.format('%s.cdb', pattern))
      out.cdb_builder = builder
      builder:add('_lrnspam', rspamd_i64.fromstring(results.learns_spam or '0'))
      builder:add('_lrnham_', rspamd_i64.fromstring(results.learns_ham or '0'))
    end

    for _,elt in ipairs(results.elts) do
      builder:add(elt.key, elt.value)
    end

    if last then
      builder:finalize()
      out.cdb_builder = nil
    end
  end
  --- Dumps every Redis hash whose key matches `pattern`.
  -- Keys are walked with SCAN in batches of `opts.batch_size`; each batch is
  -- fetched with pipelined HGETALL commands. Depending on the output mode the
  -- tokens are appended to `out` as text lines (plain or JSON) or to
  -- `out[key].elts` (CDB). All chunks except the last are written immediately;
  -- the last text chunk stays in `out` for the caller to write with `last=true`.
  -- @param conn synchronous Redis connection
  -- @param pattern SCAN MATCH pattern
  -- @param opts parsed command line options (`batch_size`, `json`, `cdb`)
  -- @param out output accumulator shared with the caller
  -- @param key name of the statistics key being dumped (index in `out` for CDB)
  local function dump_pattern(conn, pattern, opts, out, key)
    local cursor = 0

    repeat
      conn:add_cmd('SCAN', {tostring(cursor),
                            'MATCH', pattern,
                            'COUNT', tostring(opts.batch_size)})
      local ret, results = conn:exec()

      if not ret then
        rspamd_logger.errx("cannot connect execute scan command: %s", results)
        os.exit(1)
      end

      cursor = tonumber(results[1])

      local elts = results[2]
      local tokens = {}

      for _,e in ipairs(elts) do
        conn:add_cmd('HGETALL', {e})
      end
      -- This function returns many results, each for each command
      -- So if we have batch 1000, then we would have 1000 tables in form
      -- [result, {hash_content}]
      local all_results = {conn:exec()}

      for i=1,#all_results,2 do
        local r, hash_content = all_results[i], all_results[i + 1]

        if r then
          -- List to a hash map
          local data = redis_map_zip(hash_content)
          tokens[#tokens + 1] = {key = elts[(i + 1)/2], data = data}
        end
      end

      -- Output keeping track of the commas: in JSON mode every element but the
      -- very last one of the last batch is followed by a comma
      for i,d in ipairs(tokens) do
        local needs_comma = opts.json and not (cursor == 0 and i == #tokens)

        if needs_comma then
          out[#out + 1] = rspamd_logger.slog('"%s": %s,\n', d.key,
              ucl.to_format(d.data, "json-compact"))
        elseif opts.cdb then
          table.insert(out[key].elts, {
            -- First run of digits in the key; presumably the token id
            key = rspamd_i64.fromstring(string.match(d.key, '%d+')),
            -- Both counters fall back to 0 when missing or not numeric
            value = rspamd_util.pack('<n<n', tonumber(d.data["S"] or '0') or 0,
                tonumber(d.data["H"] or '0') or 0)
          })
        else
          out[#out + 1] = rspamd_logger.slog('"%s": %s\n', d.key,
              ucl.to_format(d.data, "json-compact"))
        end
      end

      if opts.json and cursor == 0 then
        out[#out + 1] = '}}\n'
      end

      -- Do not write the last chunk of out as it will be processed afterwards.
      -- `not cursor == 0` parsed as `(not cursor) == 0` and was never true, and
      -- the two output branches below were swapped.
      if cursor ~= 0 then
        if opts.cdb then
          dump_cdb(out, opts, false, key)
          out[key].elts = {}
        else
          dump_out(out, opts, false)
          clear_fcn(out)
        end
      elseif opts.cdb then
        dump_cdb(out, opts, true, key)
      end

    until cursor == 0
  end

  --- Implements the `dump` command for every registered classifier.
  -- For the spam and ham symbols it reads the `<symbol>_keys` set, then dumps
  -- the metadata hash of each member followed by all keys matching
  -- `<member>*_*`. A pattern is dumped only once even if several symbols or
  -- classifiers refer to it.
  -- @param opts parsed command line options
  local function dump_handler(opts)
    local patterns_seen = {}

    for _,cls in ipairs(classifiers) do
      local res,conn = lua_redis.redis_connect_sync(cls.redis_params, false)

      if not res then
        rspamd_logger.errx("cannot connect to redis server: %s", cls.redis_params)
        os.exit(1)
      end

      local out = {}

      local function check_keys(sym)
        if not sym then
          -- The classifier defines no statfile of this class
          return
        end

        local sym_keys_pattern = string.format("%s_keys", sym)
        conn:add_cmd('SMEMBERS', { sym_keys_pattern })
        local ret,keys = conn:exec()

        if not ret then
          rspamd_logger.errx("cannot execute command to get keys: %s", keys)
          os.exit(1)
        end

        if not opts.json then
          out[#out + 1] = string.format('"%s": %s\n', sym_keys_pattern,
              ucl.to_format(keys, 'json-compact'))
        end

        for _,k in ipairs(keys) do
          local pat = string.format('%s*_*', k)
          if not patterns_seen[pat] then
            conn:add_cmd('HGETALL', {k})
            local _ret,additional_keys = conn:exec()

            if _ret then
              if opts.json then
                out[#out + 1] = string.format('{"pattern": "%s", "meta": %s, "elts": {\n',
                    k, ucl.to_format(redis_map_zip(additional_keys), 'json-compact'))
              elseif opts.cdb then
                out[k] = redis_map_zip(additional_keys)
                out[k].elts = {}
              else
                out[#out + 1] = string.format('"%s": %s\n', k,
                    ucl.to_format(redis_map_zip(additional_keys), 'json-compact'))
              end

              dump_pattern(conn, pat, opts, out, k)
              patterns_seen[pat] = true
            else
              rspamd_logger.errx("cannot get metadata for %s: %s", k, additional_keys)
            end
          end
        end
      end

      check_keys(cls.symbol_spam)
      check_keys(cls.symbol_ham)

      -- Write the remaining chunk. An already opened compression stream must
      -- be ended even if nothing is left, or the output would be truncated.
      if #out > 0 or compress_ctx then
        dump_out(out, opts, true)
      end
    end
  end
  --- Translates one parsed dump object into Redis commands.
  -- Every `string key -> table value` pair of `obj` is converted; other pairs
  -- are ignored. An array value becomes one `SADD key elt` per element. A hash
  -- value depends on `opts.mode`:
  --   * 'replace': a single `HMSET key f1 v1 ...` (skipped if there are no fields);
  --   * otherwise: `HINCRBYFLOAT key field n` for numeric values (negated when
  --     the mode is 'subtract') and `HSET key field value` for the rest.
  -- @param obj table parsed from one input line
  -- @param opts parsed command line options (`opts.mode` is read)
  -- @param cmd_pipe array that receives `{command, {args...}}` entries
  -- @return cmd_pipe
  local function obj_to_redis_arguments(obj, opts, cmd_pipe)
    if type(obj) ~= 'table' then
      return cmd_pipe
    end

    -- Walk all keys: taking only `next(obj)` silently dropped the others
    for key,value in pairs(obj) do
      if type(key) == 'string' and type(value) == 'table' then
        if value[1] then
          -- Numeric table of elements (e.g. _keys) - it is actually a set in Redis
          for _,elt in ipairs(value) do
            table.insert(cmd_pipe, {'SADD', {key, elt}})
          end
        elseif opts.mode == 'replace' then
          local params = {key}
          for k,v in pairs(value) do
            params[#params + 1] = k
            params[#params + 1] = v
          end
          -- HMSET without any field/value pair is rejected by Redis
          if #params > 1 then
            table.insert(cmd_pipe, {'HMSET', params})
          end
        else
          local mult = 1.0
          if opts.mode == 'subtract' then
            mult = -mult
          end

          for k,v in pairs(value) do
            local num = tonumber(v)
            if num then
              table.insert(cmd_pipe,
                  {'HINCRBYFLOAT', {key, k, tostring(num * mult)}})
            else
              -- Non-numeric fields cannot be incremented, so just set them
              table.insert(cmd_pipe, {'HSET', {key, k, v}})
            end
          end
        end
      end
    end

    return cmd_pipe
  end
  --- Converts a batch of parsed objects to Redis commands and runs them.
  -- With `opts.no_operation` the commands are only logged. Otherwise they are
  -- pipelined to every connection in `conns`; commands that cannot be queued
  -- and replies that report a failure are logged, and processing continues.
  -- @param batch array of objects parsed from the input
  -- @param conns array of synchronous Redis connections
  -- @param opts parsed command line options (`mode`, `no_operation`)
  local function execute_batch(batch, conns, opts)
    local cmd_pipe = {}

    for _,cmd in ipairs(batch) do
      obj_to_redis_arguments(cmd, opts, cmd_pipe)
    end

    if opts.no_operation then
      for _,cmd in ipairs(cmd_pipe) do
        rspamd_logger.messagex('%s %s', cmd[1], table.concat(cmd[2], ' '))
      end
      return
    end

    if #cmd_pipe == 0 then
      -- Nothing to send
      return
    end

    -- exec() yields a (status, payload) pair per pipelined command; report
    -- the failed ones instead of dropping them silently
    local function report_failures(...)
      local nreplies = select('#', ...)
      local replies = {...}
      for i = 1, nreplies, 2 do
        if not replies[i] then
          rspamd_logger.errx("cannot execute redis command: %s", replies[i + 1])
        end
      end
    end

    for _, conn in ipairs(conns) do
      local queued = 0
      for _,cmd in ipairs(cmd_pipe) do
        local is_ok, err = conn:add_cmd(cmd[1], cmd[2])

        if is_ok then
          queued = queued + 1
        else
          rspamd_logger.errx("cannot add command: %s with args: %s: %s", cmd[1], cmd[2], err)
        end
      end

      if queued > 0 then
        report_failures(conn:exec())
      end
    end
  end
  --- Implements the `restore` command.
  -- Connects to the Redis server of every registered classifier, then reads the
  -- input files line by line (each line is parsed as a UCL object) and sends
  -- the resulting commands in batches of `opts.batch_size` objects. The file
  -- name '-' means standard input, which is also used when no file is given.
  -- Any connection, open or parse error is logged and terminates the process
  -- with status 1.
  -- @param opts parsed command line options (`file`, `batch_size`, `mode`, ...)
  local function restore_handler(opts)
    local files = opts.file
    if type(files) == 'string' then
      files = {files}
    elseif type(files) ~= 'table' or #files == 0 then
      -- An empty list is truthy, so `opts.file or {'-'}` never selected stdin
      files = {'-'}
    end

    local conns = {}

    for _,cls in ipairs(classifiers) do
      local res,conn = lua_redis.redis_connect_sync(cls.redis_params, true)

      if not res then
        rspamd_logger.errx("cannot connect to redis server: %s", cls.redis_params)
        os.exit(1)
      end

      table.insert(conns, conn)
    end

    local batch = {}

    for _,f in ipairs(files) do
      -- Read from an explicit handle: redirecting io.input() left a closed
      -- default input behind for the next file
      local fd = io.stdin
      if f ~= '-' then
        local open_err
        fd, open_err = io.open(f, 'r')
        if not fd then
          rspamd_logger.errx("cannot open %s: %s", f, open_err)
          os.exit(1)
        end
      end

      local cur_line = 1
      for line in fd:lines() do
        local ucl_parser = ucl.parser()
        local res, err = ucl_parser:parse_string(line)

        if not res then
          rspamd_logger.errx("%s: cannot read line %s: %s", f, cur_line, err)
          os.exit(1)
        end

        table.insert(batch, ucl_parser:get_object())
        cur_line = cur_line + 1

        -- Cut batches by their real size (the line counter was off by one)
        if #batch >= opts.batch_size then
          execute_batch(batch, conns, opts)
          batch = {}
        end
      end

      if fd ~= io.stdin then
        fd:close()
      end
    end

    if #batch > 0 then
      execute_batch(batch, conns, opts)
    end
  end
  --- Registers every Redis-backed classifier found under `node`.
  -- `node` is either a single classifier table or an array of them; array
  -- elements may wrap the actual definition in a `bayes` key.
  local function register_redis_classifiers(node, cfg)
    if type(node) ~= 'table' then
      return
    end

    if node[1] then
      for _,cls in ipairs(node) do
        if type(cls) == 'table' then
          register_redis_classifiers(cls.bayes or cls, cfg)
        end
      end
    elseif node.backend == 'redis' then
      check_redis_classifier(node, cfg)
    end
  end

  --- Entry point of the `statistics_dump` command.
  -- Parses the arguments, loads the configuration, collects the Redis-backed
  -- bayes classifiers and runs the selected subcommand (`dump` by default).
  -- @param args command line arguments
  local function handler(args)
    local opts = parser:parse(args)

    local command = opts.command or 'dump'

    load_config(opts)
    rspamd_config:init_subsystem('stat')

    local obj = rspamd_config:get_ucl()

    local classifier = obj.classifier

    if classifier then
      if classifier[1] then
        register_redis_classifiers(classifier, obj)
      elseif classifier.bayes then
        register_redis_classifiers(classifier.bayes, obj)
      end
    end

    -- A single file name is normalised to a list. A missing value is left
    -- untouched so that restore falls back to its own default; the previous
    -- `type(...) == 'none'` test could never be true.
    if type(opts.file) == 'string' then
      opts.file = {opts.file}
    end

    if command == 'dump' then
      dump_handler(opts)
    elseif command == 'restore' then
      restore_handler(opts)
    else
      -- parser:error() takes a ready message, so format it here
      parser:error(string.format('command %s is not implemented', command))
    end
  end

  -- Command descriptor returned to the loader of this file
  return {
    name = 'statistics_dump',
    aliases = {'stat_dump', 'bayes_dump'},
    handler = handler,
    description = parser._description
  }