You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

clickhouse.lua 49KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537
  1. --[[
  2. Copyright (c) 2022, Vsevolod Stakhov <vsevolod@rspamd.com>
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ]]--
  13. local rspamd_logger = require 'rspamd_logger'
  14. local upstream_list = require "rspamd_upstream_list"
  15. local lua_util = require "lua_util"
  16. local lua_clickhouse = require "lua_clickhouse"
  17. local lua_settings = require "lua_settings"
  18. local fun = require "fun"
  -- Module name
  local N = "clickhouse"

  -- Stop loading the rest of the file when the `confighelp` global is set
  if confighelp then return end

  -- Module-level collection state (initial values only; the code that reads and
  -- updates these lives further down the file)
  local data_rows, custom_rows = {}, {}
  local nrows, used_memory, last_collection = 0, 0, 0
  local final_call = false -- If the final collection has been started
  local schema_version = 9 -- Current schema version
  -- Default module settings.
  -- The `*_symbols` lists name the rspamd symbols that decide the value of the
  -- corresponding `Is*` column (see `clickhouse_check_symbol`).
  local settings = {
    limits = { -- Collection limits
      max_rows = 1000, -- How many rows are allowed (0 for disable this)
      max_memory = 50 * 1024 * 1024, -- How many memory should be occupied before sending collection
      max_interval = 60, -- Maximum collection interval
    },
    collect_garbage = false, -- Perform GC collection after sending the data
    check_timeout = 10.0, -- Periodic timeout
    timeout = 5.0,
    bayes_spam_symbols = {'BAYES_SPAM'},
    bayes_ham_symbols = {'BAYES_HAM'},
    ann_symbols_spam = {'NEURAL_SPAM'},
    ann_symbols_ham = {'NEURAL_HAM'},
    fuzzy_symbols = {'FUZZY_DENIED'},
    whitelist_symbols = {'WHITELIST_DKIM', 'WHITELIST_SPF_DKIM', 'WHITELIST_DMARC'},
    dkim_allow_symbols = {'R_DKIM_ALLOW'},
    dkim_reject_symbols = {'R_DKIM_REJECT'},
    dkim_dnsfail_symbols = {'R_DKIM_TEMPFAIL', 'R_DKIM_PERMFAIL'},
    dkim_na_symbols = {'R_DKIM_NA'},
    dmarc_allow_symbols = {'DMARC_POLICY_ALLOW'},
    dmarc_reject_symbols = {'DMARC_POLICY_REJECT'},
    dmarc_quarantine_symbols = {'DMARC_POLICY_QUARANTINE'},
    dmarc_softfail_symbols = {'DMARC_POLICY_SOFTFAIL'},
    dmarc_na_symbols = {'DMARC_NA'},
    spf_allow_symbols = {'R_SPF_ALLOW'},
    spf_reject_symbols = {'R_SPF_FAIL'},
    spf_dnsfail_symbols = {'R_SPF_DNSFAIL', 'R_SPF_PERMFAIL'},
    -- Must list SPF symbols: this used to repeat the DKIM dnsfail symbols, which
    -- recorded DKIM failures as SPF 'neutral' and never matched a neutral SPF result
    spf_neutral_symbols = {'R_SPF_NEUTRAL'},
    spf_na_symbols = {'R_SPF_NA'},
    stop_symbols = {}, -- a message carrying any of these symbols is not stored
    ipmask = 19, -- mask applied to IPv4 client addresses
    ipmask6 = 48, -- mask applied to non-IPv4 client addresses
    full_urls = false, -- store full URL text instead of one entry per host + flags
    from_tables = nil,
    enable_symbols = false, -- also insert the Symbols.* and Groups.* columns
    database = 'default',
    use_https = false,
    use_gzip = true,
    allow_local = false, -- when false, rspamc/controller tasks are not stored
    insert_subject = false, -- fill the Subject column
    subject_privacy = false, -- subject privacy is off
    subject_privacy_alg = 'blake2', -- default hash-algorithm to obfuscate subject
    subject_privacy_prefix = 'obf', -- prefix to show it's obfuscated
    subject_privacy_length = 16, -- cut the length of the hash
    schema_additions = {}, -- additional SQL statements to be executed when schema is uploaded
    user = nil,
    password = nil,
    no_ssl_verify = false,
    custom_rules = {}, -- rule key -> rule object; `first_row()` yields its INSERT statement
    enable_digest = false, -- fill the Digest column from task:get_digest()
    exceptions = nil, -- optional object; `exceptions:process(task)` may exclude a message
    retention = {
      enable = false,
      method = 'detach',
      period_months = 3,
      run_every = '7d',
    },
    extra_columns = {}, -- list of tables with a `name` field, appended to the INSERT column list
  }
  -- Statements that create the schema from scratch, in order:
  --   1. the main `rspamd` table
  --   2. the `rspamd_version` bookkeeping table
  --   3. the initial version row; `${SCHEMA_VERSION}` is a placeholder and the
  --      trailing `true` is a flag for whoever executes these statements
  --      (NOTE(review): both are handled outside this excerpt - confirm semantics)
  --- @language SQL
  local clickhouse_schema = {
    [[
  CREATE TABLE IF NOT EXISTS rspamd
  (
    Date Date COMMENT 'Date (used for partitioning)',
    TS DateTime COMMENT 'Date and time of the request start (UTC)',
    From String COMMENT 'Domain part of the return address (RFC5321.MailFrom)',
    MimeFrom String COMMENT 'Domain part of the address in From: header (RFC5322.From)',
    IP String COMMENT 'SMTP client IP as provided by MTA or from Received: header',
    Helo String COMMENT 'Full hostname as sent by the SMTP client (RFC5321.HELO/.EHLO)',
    Score Float32 COMMENT 'Message score',
    NRcpt UInt8 COMMENT 'Number of envelope recipients (RFC5321.RcptTo)',
    Size UInt32 COMMENT 'Message size in bytes',
    IsWhitelist Enum8('blacklist' = 0, 'whitelist' = 1, 'unknown' = 2) DEFAULT 'unknown' COMMENT 'Based on symbols configured in `whitelist_symbols` module option',
    IsBayes Enum8('ham' = 0, 'spam' = 1, 'unknown' = 2) DEFAULT 'unknown' COMMENT 'Based on symbols configured in `bayes_spam_symbols` and `bayes_ham_symbols` module options',
    IsFuzzy Enum8('whitelist' = 0, 'deny' = 1, 'unknown' = 2) DEFAULT 'unknown' COMMENT 'Based on symbols configured in `fuzzy_symbols` module option',
    IsFann Enum8('ham' = 0, 'spam' = 1, 'unknown' = 2) DEFAULT 'unknown' COMMENT 'Based on symbols configured in `ann_symbols_spam` and `ann_symbols_ham` module options',
    IsDkim Enum8('reject' = 0, 'allow' = 1, 'unknown' = 2, 'dnsfail' = 3, 'na' = 4) DEFAULT 'unknown' COMMENT 'Based on symbols configured in dkim_* module options',
    IsDmarc Enum8('reject' = 0, 'allow' = 1, 'unknown' = 2, 'softfail' = 3, 'na' = 4, 'quarantine' = 5) DEFAULT 'unknown' COMMENT 'Based on symbols configured in dmarc_* module options',
    IsSpf Enum8('reject' = 0, 'allow' = 1, 'neutral' = 2, 'dnsfail' = 3, 'na' = 4, 'unknown' = 5) DEFAULT 'unknown' COMMENT 'Based on symbols configured in spf_* module options',
    NUrls Int32 COMMENT 'Number of URLs and email extracted from the message',
    Action Enum8('reject' = 0, 'rewrite subject' = 1, 'add header' = 2, 'greylist' = 3, 'no action' = 4, 'soft reject' = 5, 'custom' = 6) DEFAULT 'no action' COMMENT 'Action returned for the message; if action is not predefined actual action will be in `CustomAction` field',
    CustomAction LowCardinality(String) COMMENT 'Action string for custom action',
    FromUser String COMMENT 'Local part of the return address (RFC5321.MailFrom)',
    MimeUser String COMMENT 'Local part of the address in From: header (RFC5322.From)',
    RcptUser String COMMENT '[Deprecated] Local part of the first envelope recipient (RFC5321.RcptTo)',
    RcptDomain String COMMENT '[Deprecated] Domain part of the first envelope recipient (RFC5321.RcptTo)',
    SMTPRecipients Array(String) COMMENT 'List of envelope recipients (RFC5321.RcptTo)',
    MimeRecipients Array(String) COMMENT 'List of recipients from headers (RFC5322.To/.CC/.BCC)',
    MessageId String COMMENT 'Message-ID header',
    ListId String COMMENT 'List-Id header',
    Subject String COMMENT 'Subject header (or hash if `subject_privacy` module option enabled)',
    `Attachments.FileName` Array(String) COMMENT 'Attachment name',
    `Attachments.ContentType` Array(String) COMMENT 'Attachment Content-Type',
    `Attachments.Length` Array(UInt32) COMMENT 'Attachment size in bytes',
    `Attachments.Digest` Array(FixedString(16)) COMMENT 'First 16 characters of hash returned by mime_part:get_digest()',
    `Urls.Tld` Array(String) COMMENT 'Effective second level domain part of the URL host',
    `Urls.Url` Array(String) COMMENT 'Full URL if `full_urls` module option enabled, host part of URL otherwise',
    `Urls.Flags` Array(UInt32) COMMENT 'Corresponding url flags, see `enum rspamd_url_flags` in libserver/url.h for details',
    Emails Array(String) COMMENT 'List of emails extracted from the message',
    ASN UInt32 COMMENT 'BGP AS number for SMTP client IP (returned by asn.rspamd.com or asn6.rspamd.com)',
    Country FixedString(2) COMMENT 'Country for SMTP client IP (returned by asn.rspamd.com or asn6.rspamd.com)',
    IPNet String,
    `Symbols.Names` Array(LowCardinality(String)) COMMENT 'Symbol name',
    `Symbols.Scores` Array(Float32) COMMENT 'Symbol score',
    `Symbols.Options` Array(String) COMMENT 'Symbol options (comma separated list)',
    `Groups.Names` Array(LowCardinality(String)) COMMENT 'Group name',
    `Groups.Scores` Array(Float32) COMMENT 'Group score',
    ScanTimeReal UInt32 COMMENT 'Request time in milliseconds',
    ScanTimeVirtual UInt32 COMMENT 'Deprecated do not use',
    AuthUser String COMMENT 'Username for authenticated SMTP client',
    SettingsId LowCardinality(String) COMMENT 'ID for the settings profile',
    Digest FixedString(32) COMMENT '[Deprecated]',
    SMTPFrom ALIAS if(From = '', '', concat(FromUser, '@', From)) COMMENT 'Return address (RFC5321.MailFrom)',
    SMTPRcpt ALIAS SMTPRecipients[1] COMMENT 'The first envelope recipient (RFC5321.RcptTo)',
    MIMEFrom ALIAS if(MimeFrom = '', '', concat(MimeUser, '@', MimeFrom)) COMMENT 'Address in From: header (RFC5322.From)',
    MIMERcpt ALIAS MimeRecipients[1] COMMENT 'The first recipient from headers (RFC5322.To/.CC/.BCC)'
  ) ENGINE = MergeTree()
  PARTITION BY toMonday(Date)
  ORDER BY TS
  ]],

    [[CREATE TABLE IF NOT EXISTS rspamd_version ( Version UInt32) ENGINE = TinyLog]],

    { [[INSERT INTO rspamd_version (Version) Values (${SCHEMA_VERSION})]], true },
  }
  -- This describes SQL queries to migrate between versions.
  -- `migrations[v]` is the ordered list of statements that upgrade a database at
  -- schema version `v` to `v + 1`; the last statement of every list records the
  -- new version in `rspamd_version`.
  local migrations = {
    [1] = {
      -- Move to a wide fat table
      [[ALTER TABLE rspamd
        ADD COLUMN IF NOT EXISTS `Attachments.FileName` Array(String) AFTER ListId,
        ADD COLUMN IF NOT EXISTS `Attachments.ContentType` Array(String) AFTER `Attachments.FileName`,
        ADD COLUMN IF NOT EXISTS `Attachments.Length` Array(UInt32) AFTER `Attachments.ContentType`,
        ADD COLUMN IF NOT EXISTS `Attachments.Digest` Array(FixedString(16)) AFTER `Attachments.Length`,
        ADD COLUMN IF NOT EXISTS `Urls.Tld` Array(String) AFTER `Attachments.Digest`,
        ADD COLUMN IF NOT EXISTS `Urls.Url` Array(String) AFTER `Urls.Tld`,
        ADD COLUMN IF NOT EXISTS Emails Array(String) AFTER `Urls.Url`,
        ADD COLUMN IF NOT EXISTS ASN UInt32 AFTER Emails,
        ADD COLUMN IF NOT EXISTS Country FixedString(2) AFTER ASN,
        ADD COLUMN IF NOT EXISTS IPNet String AFTER Country,
        ADD COLUMN IF NOT EXISTS `Symbols.Names` Array(String) AFTER IPNet,
        ADD COLUMN IF NOT EXISTS `Symbols.Scores` Array(Float64) AFTER `Symbols.Names`,
        ADD COLUMN IF NOT EXISTS `Symbols.Options` Array(String) AFTER `Symbols.Scores`]],
      -- Add explicit version.
      -- IF NOT EXISTS keeps this step re-runnable like the ALTER above and like
      -- the bootstrap schema: a retry after a partial failure must not abort
      -- because the table is already there.
      [[CREATE TABLE IF NOT EXISTS rspamd_version ( Version UInt32) ENGINE = TinyLog]],
      [[INSERT INTO rspamd_version (Version) Values (2)]],
    },
    [2] = {
      -- Add `Subject` column
      [[ALTER TABLE rspamd
        ADD COLUMN IF NOT EXISTS Subject String AFTER ListId]],
      -- New version
      [[INSERT INTO rspamd_version (Version) Values (3)]],
    },
    [3] = {
      -- Add SPF state, widen the DKIM/DMARC enums, add recipient/message-id/scan-time columns
      [[ALTER TABLE rspamd
        ADD COLUMN IF NOT EXISTS IsSpf Enum8('reject' = 0, 'allow' = 1, 'neutral' = 2, 'dnsfail' = 3, 'na' = 4, 'unknown' = 5) DEFAULT 'unknown' AFTER IsDmarc,
        MODIFY COLUMN IsDkim Enum8('reject' = 0, 'allow' = 1, 'unknown' = 2, 'dnsfail' = 3, 'na' = 4) DEFAULT 'unknown',
        MODIFY COLUMN IsDmarc Enum8('reject' = 0, 'allow' = 1, 'unknown' = 2, 'softfail' = 3, 'na' = 4, 'quarantine' = 5) DEFAULT 'unknown',
        ADD COLUMN IF NOT EXISTS MimeRecipients Array(String) AFTER RcptDomain,
        ADD COLUMN IF NOT EXISTS MessageId String AFTER MimeRecipients,
        ADD COLUMN IF NOT EXISTS ScanTimeReal UInt32 AFTER `Symbols.Options`,
        ADD COLUMN IF NOT EXISTS ScanTimeVirtual UInt32 AFTER ScanTimeReal]],
      -- Add aliases
      [[ALTER TABLE rspamd
        ADD COLUMN IF NOT EXISTS SMTPFrom ALIAS if(From = '', '', concat(FromUser, '@', From)),
        ADD COLUMN IF NOT EXISTS SMTPRcpt ALIAS if(RcptDomain = '', '', concat(RcptUser, '@', RcptDomain)),
        ADD COLUMN IF NOT EXISTS MIMEFrom ALIAS if(MimeFrom = '', '', concat(MimeUser, '@', MimeFrom)),
        ADD COLUMN IF NOT EXISTS MIMERcpt ALIAS MimeRecipients[1]
      ]],
      -- New version
      [[INSERT INTO rspamd_version (Version) Values (4)]],
    },
    [4] = {
      -- Add the 'custom' action plus the column that carries its name
      [[ALTER TABLE rspamd
        MODIFY COLUMN Action Enum8('reject' = 0, 'rewrite subject' = 1, 'add header' = 2, 'greylist' = 3, 'no action' = 4, 'soft reject' = 5, 'custom' = 6) DEFAULT 'no action',
        ADD COLUMN IF NOT EXISTS CustomAction String AFTER Action
      ]],
      -- New version
      [[INSERT INTO rspamd_version (Version) Values (5)]],
    },
    [5] = {
      -- Add authenticated user and settings id
      [[ALTER TABLE rspamd
        ADD COLUMN IF NOT EXISTS AuthUser String AFTER ScanTimeVirtual,
        ADD COLUMN IF NOT EXISTS SettingsId LowCardinality(String) AFTER AuthUser
      ]],
      -- New version
      [[INSERT INTO rspamd_version (Version) Values (6)]],
    },
    [6] = {
      -- Add new columns
      [[ALTER TABLE rspamd
        ADD COLUMN IF NOT EXISTS Helo String AFTER IP,
        ADD COLUMN IF NOT EXISTS SMTPRecipients Array(String) AFTER RcptDomain
      ]],
      -- Modify SMTPRcpt alias
      [[
        ALTER TABLE rspamd
        MODIFY COLUMN SMTPRcpt ALIAS SMTPRecipients[1]
      ]],
      -- New version
      [[INSERT INTO rspamd_version (Version) Values (7)]],
    },
    [7] = {
      -- Add new columns
      [[ALTER TABLE rspamd
        ADD COLUMN IF NOT EXISTS `Groups.Names` Array(LowCardinality(String)) AFTER `Symbols.Options`,
        ADD COLUMN IF NOT EXISTS `Groups.Scores` Array(Float32) AFTER `Groups.Names`
      ]],
      -- New version
      [[INSERT INTO rspamd_version (Version) Values (8)]],
    },
    [8] = {
      -- Add new columns
      [[ALTER TABLE rspamd
        ADD COLUMN IF NOT EXISTS `Urls.Flags` Array(UInt32) AFTER `Urls.Url`
      ]],
      -- New version
      [[INSERT INTO rspamd_version (Version) Values (9)]],
    },
  }
  -- Set of action names that are looked up as-is; an action missing from this
  -- set is reported as 'custom' with its real name kept separately
  local predefined_actions = {
    reject = true,
    greylist = true,
    ['rewrite subject'] = true,
    ['add header'] = true,
    ['no action'] = true,
    ['soft reject'] = true,
  }
  --- Append the names of the scalar/main columns to `res`.
  -- The order here is the column order of the generated INSERT statement, so it
  -- has to stay in step with the order in which row values are produced.
  -- @param res array of column names, extended in place
  local function clickhouse_main_row(res)
    local columns = {
      'Date', 'TS', 'From', 'MimeFrom', 'IP', 'Helo', 'Score', 'NRcpt', 'Size',
      'IsWhitelist', 'IsBayes', 'IsFuzzy', 'IsFann', 'IsDkim', 'IsDmarc',
      'NUrls', 'Action', 'FromUser', 'MimeUser', 'RcptUser', 'RcptDomain',
      'SMTPRecipients', 'ListId', 'Subject', 'Digest',
      -- 1.9.2 +
      'IsSpf', 'MimeRecipients', 'MessageId', 'ScanTimeReal',
      -- 1.9.3 +
      'CustomAction',
      -- 2.0 +
      'AuthUser', 'SettingsId',
    }
    for idx = 1, #columns do
      res[#res + 1] = columns[idx]
    end
  end
  --- Append the `Attachments.*` column names to `res`.
  -- @param res array of column names, extended in place
  local function clickhouse_attachments_row(res)
    local columns = {
      'Attachments.FileName',
      'Attachments.ContentType',
      'Attachments.Length',
      'Attachments.Digest',
    }
    for idx = 1, #columns do
      res[#res + 1] = columns[idx]
    end
  end
  --- Append the `Urls.*` column names to `res`.
  -- @param res array of column names, extended in place
  local function clickhouse_urls_row(res)
    local columns = { 'Urls.Tld', 'Urls.Url', 'Urls.Flags' }
    for idx = 1, #columns do
      res[#res + 1] = columns[idx]
    end
  end
  --- Append the `Emails` column name to `res`.
  -- @param res array of column names, extended in place
  local function clickhouse_emails_row(res)
    res[#res + 1] = 'Emails'
  end
  --- Append the `Symbols.*` column names to `res`.
  -- @param res array of column names, extended in place
  local function clickhouse_symbols_row(res)
    local columns = { 'Symbols.Names', 'Symbols.Scores', 'Symbols.Options' }
    for idx = 1, #columns do
      res[#res + 1] = columns[idx]
    end
  end
  --- Append the `Groups.*` column names to `res`.
  -- @param res array of column names, extended in place
  local function clickhouse_groups_row(res)
    local columns = { 'Groups.Names', 'Groups.Scores' }
    for idx = 1, #columns do
      res[#res + 1] = columns[idx]
    end
  end
  --- Append the ASN-related column names (`ASN`, `Country`, `IPNet`) to `res`.
  -- @param res array of column names, extended in place
  local function clickhouse_asn_row(res)
    local columns = { 'ASN', 'Country', 'IPNet' }
    for idx = 1, #columns do
      res[#res + 1] = columns[idx]
    end
  end
  --- Append the `name` of every entry in `settings.extra_columns` to `res`.
  -- @param res array of column names, extended in place
  local function clickhouse_extra_columns(res)
    local extras = settings.extra_columns
    for _, column in ipairs(extras) do
      res[#res + 1] = column.name
    end
  end
  --- Format a timestamp as a `YYYY-MM-DD` date string.
  -- @param ts timestamp handed to `os.date`
  -- @return the formatted date
  local function today(ts)
    -- the leading '!' makes os.date format in UTC instead of local time
    local utc_day = os.date('!%Y-%m-%d', ts)
    return utc_day
  end
  --- Set one classification field from the first configured symbol the task has.
  -- The symbol list `settings[settings_field_name]` is walked in order (a missing
  -- list counts as empty) and the search stops at the first symbol present.
  -- @param task object providing `has_symbol(name)` and `get_symbol(name)`
  -- @param settings_field_name key in `settings` that holds the list of symbol names
  -- @param fields_table table that receives the result
  -- @param field_name key written in `fields_table`
  -- @param value stored on a match; with `value_negative` given, only when the
  --   matched symbol's first result has a score above zero
  -- @param value_negative optional; stored instead of `value` when that score is
  --   not above zero
  -- @return true if a symbol matched and the field was written, false otherwise
  local function clickhouse_check_symbol(task, settings_field_name, fields_table,
      field_name, value, value_negative)
    local symbol_names = settings[settings_field_name] or {}
    for _, name in ipairs(symbol_names) do
      if task:has_symbol(name) then
        local verdict = value
        if value_negative then
          local first_result = task:get_symbol(name)[1]
          if not (first_result['score'] > 0) then
            verdict = value_negative
          end
        end
        fields_table[field_name] = verdict
        return true
      end
    end
    return false
  end
  --- Send the collected rows to one ClickHouse upstream.
  -- Generic rows are inserted into the `rspamd` table using a column list built
  -- from the `clickhouse_*_row` helpers; the rows of each custom rule are sent
  -- with the statement returned by that rule's `first_row()`.
  -- @param task rspamd task, or nil; without a task `rspamd_config` and `ev_base`
  --   are passed to `lua_clickhouse.insert` instead
  -- @param ev_base event base, only used when `task` is nil
  -- @param why reason string, only used in log messages
  -- @param gen_rows array of rows for the generic `rspamd` table
  -- @param cust_rows map: key of `settings.custom_rules` -> array of rows
  local function clickhouse_send_data(task, ev_base, why, gen_rows, cust_rows)
    local log_object = task or rspamd_config
    local upstream = settings.upstream:get_upstream_round_robin()
    local ip_addr = upstream:get_addr():to_string(true)

    -- `cust_rows` is keyed by rule, so `#cust_rows` never yields a row count;
    -- add up the rows of every custom table that is actually going to be sent
    local total_rows = #gen_rows
    for _, crows in pairs(cust_rows) do
      if #crows > 1 then
        total_rows = total_rows + #crows
      end
    end
    rspamd_logger.infox(log_object, "trying to send %s rows to clickhouse server %s; started as %s",
        total_rows, ip_addr, why)

    -- Callback factories: log the outcome and report it to the upstream
    local function gen_success_cb(what, how_many)
      return function (_, _)
        rspamd_logger.messagex(log_object, "sent %s rows of %s to clickhouse server %s; started as %s",
            how_many, what, ip_addr, why)
        upstream:ok()
      end
    end

    local function gen_fail_cb(what, how_many)
      return function (_, err)
        rspamd_logger.errx(log_object, "cannot send %s rows of %s data to clickhouse server %s: %s; started as %s",
            how_many, what, ip_addr, err, why)
        upstream:fail()
      end
    end

    -- Issue one INSERT of `tbl` with statement `query`
    local function send_data(what, tbl, query)
      local ch_params = {}
      if task then
        ch_params.task = task
      else
        ch_params.config = rspamd_config
        ch_params.ev_base = ev_base
      end

      local ret = lua_clickhouse.insert(upstream, settings, ch_params,
          query, tbl,
          gen_success_cb(what, #tbl),
          gen_fail_cb(what, #tbl))
      if not ret then
        rspamd_logger.errx(log_object, "cannot send %s rows of %s data to clickhouse server %s: %s",
            #tbl, what, ip_addr, 'cannot make HTTP request')
      end
    end

    -- Column list for the generic table; the order of these calls defines the
    -- column order of the INSERT statement
    local fields = {}
    clickhouse_main_row(fields)
    clickhouse_attachments_row(fields)
    clickhouse_urls_row(fields)
    clickhouse_emails_row(fields)
    clickhouse_asn_row(fields)

    if settings.enable_symbols then
      clickhouse_symbols_row(fields)
      clickhouse_groups_row(fields)
    end

    if #settings.extra_columns > 0 then
      clickhouse_extra_columns(fields)
    end

    send_data('generic data', gen_rows,
        string.format('INSERT INTO rspamd (%s)',
            table.concat(fields, ',')))

    for k, crows in pairs(cust_rows) do
      -- NOTE(review): a rule with exactly one buffered row is skipped here;
      -- confirm whether `>= 1` was intended (how rows are buffered is not
      -- visible in this function)
      if #crows > 1 then
        send_data('custom data ('..k..')', crows,
            settings.custom_rules[k].first_row())
      end
    end
  end
  428. local function clickhouse_collect(task)
  429. if task:has_flag('skip') then
  430. return
  431. end
  432. if not settings.allow_local and lua_util.is_rspamc_or_controller(task) then
  433. return
  434. end
  435. for _,sym in ipairs(settings.stop_symbols) do
  436. if task:has_symbol(sym) then
  437. rspamd_logger.infox(task, 'skip Clickhouse storage for message: symbol %s has fired', sym)
  438. return
  439. end
  440. end
  441. if settings.exceptions then
  442. local excepted,trace = settings.exceptions:process(task)
  443. if excepted then
  444. rspamd_logger.infox(task, 'skipped Clickhouse storage for message: excepted (%s)',
  445. trace)
  446. -- Excepted
  447. return
  448. end
  449. end
  450. local from_domain = ''
  451. local from_user = ''
  452. if task:has_from('smtp') then
  453. local from = task:get_from({'smtp','orig'})[1]
  454. if from then
  455. from_domain = from['domain']:lower()
  456. from_user = from['user']
  457. end
  458. end
  459. local mime_domain = ''
  460. local mime_user = ''
  461. if task:has_from('mime') then
  462. local from = task:get_from({'mime','orig'})[1]
  463. if from then
  464. mime_domain = from['domain']:lower()
  465. mime_user = from['user']
  466. end
  467. end
  468. local mime_recipients = {}
  469. if task:has_recipients('mime') then
  470. local recipients = task:get_recipients({'mime','orig'})
  471. for _, rcpt in ipairs(recipients) do
  472. table.insert(mime_recipients, rcpt['user'] .. '@' .. rcpt['domain']:lower())
  473. end
  474. end
  475. local ip_str = 'undefined'
  476. local ip = task:get_from_ip()
  477. if ip and ip:is_valid() then
  478. local ipnet
  479. if ip:get_version() == 4 then
  480. ipnet = ip:apply_mask(settings['ipmask'])
  481. else
  482. ipnet = ip:apply_mask(settings['ipmask6'])
  483. end
  484. ip_str = ipnet:to_string()
  485. end
  486. local helo = task:get_helo() or ''
  487. local rcpt_user = ''
  488. local rcpt_domain = ''
  489. local smtp_recipients = {}
  490. if task:has_recipients('smtp') then
  491. local recipients = task:get_recipients('smtp')
  492. -- for compatibility with an old table structure
  493. rcpt_user = recipients[1]['user']
  494. rcpt_domain = recipients[1]['domain']:lower()
  495. for _, rcpt in ipairs(recipients) do
  496. table.insert(smtp_recipients, rcpt['user'] .. '@' .. rcpt['domain']:lower())
  497. end
  498. end
  499. local list_id = task:get_header('List-Id') or ''
  500. local message_id = lua_util.maybe_obfuscate_string(task:get_message_id() or '',
  501. settings, 'mid')
  502. local score = task:get_metric_score()[1];
  503. local fields = {
  504. bayes = 'unknown',
  505. fuzzy = 'unknown',
  506. ann = 'unknown',
  507. whitelist = 'unknown',
  508. dkim = 'unknown',
  509. dmarc = 'unknown',
  510. spf = 'unknown',
  511. }
  512. local ret
  513. ret = clickhouse_check_symbol(task,'bayes_spam_symbols', fields,
  514. 'bayes', 'spam')
  515. if not ret then
  516. clickhouse_check_symbol(task,'bayes_ham_symbols', fields,
  517. 'bayes', 'ham')
  518. end
  519. clickhouse_check_symbol(task,'ann_symbols_spam', fields,
  520. 'ann', 'spam')
  521. if not ret then
  522. clickhouse_check_symbol(task,'ann_symbols_ham', fields,
  523. 'ann', 'ham')
  524. end
  525. clickhouse_check_symbol(task,'whitelist_symbols', fields,
  526. 'whitelist', 'blacklist', 'whitelist')
  527. clickhouse_check_symbol(task,'fuzzy_symbols', fields,
  528. 'fuzzy', 'deny')
  529. ret = clickhouse_check_symbol(task,'dkim_allow_symbols', fields,
  530. 'dkim', 'allow')
  531. if not ret then
  532. ret = clickhouse_check_symbol(task,'dkim_reject_symbols', fields,
  533. 'dkim', 'reject')
  534. end
  535. if not ret then
  536. ret = clickhouse_check_symbol(task,'dkim_dnsfail_symbols', fields,
  537. 'dkim', 'dnsfail')
  538. end
  539. if not ret then
  540. clickhouse_check_symbol(task,'dkim_na_symbols', fields,
  541. 'dkim', 'na')
  542. end
  543. ret = clickhouse_check_symbol(task,'dmarc_allow_symbols', fields,
  544. 'dmarc', 'allow')
  545. if not ret then
  546. ret = clickhouse_check_symbol(task,'dmarc_reject_symbols', fields,
  547. 'dmarc', 'reject')
  548. end
  549. if not ret then
  550. ret = clickhouse_check_symbol(task,'dmarc_quarantine_symbols', fields,
  551. 'dmarc', 'quarantine')
  552. end
  553. if not ret then
  554. ret = clickhouse_check_symbol(task,'dmarc_softfail_symbols', fields,
  555. 'dmarc', 'softfail')
  556. end
  557. if not ret then
  558. clickhouse_check_symbol(task,'dmarc_na_symbols', fields,
  559. 'dmarc', 'na')
  560. end
  561. ret = clickhouse_check_symbol(task,'spf_allow_symbols', fields,
  562. 'spf', 'allow')
  563. if not ret then
  564. ret = clickhouse_check_symbol(task,'spf_reject_symbols', fields,
  565. 'spf', 'reject')
  566. end
  567. if not ret then
  568. ret = clickhouse_check_symbol(task,'spf_neutral_symbols', fields,
  569. 'spf', 'neutral')
  570. end
  571. if not ret then
  572. ret = clickhouse_check_symbol(task,'spf_dnsfail_symbols', fields,
  573. 'spf', 'dnsfail')
  574. end
  575. if not ret then
  576. clickhouse_check_symbol(task,'spf_na_symbols', fields,
  577. 'spf', 'na')
  578. end
  579. local nrcpts = 0
  580. if task:has_recipients('smtp') then
  581. nrcpts = #task:get_recipients('smtp')
  582. end
  583. local nurls = 0
  584. local task_urls = task:get_urls({
  585. content = true,
  586. images = true,
  587. emails = false,
  588. sort = true,
  589. }) or {}
  590. nurls = #task_urls
  591. local timestamp = math.floor(task:get_date({
  592. format = 'connect',
  593. gmt = true, -- The only sane way to sync stuff with different timezones
  594. }))
  595. local action = task:get_metric_action()
  596. local custom_action = ''
  597. if not predefined_actions[action] then
  598. custom_action = action
  599. action = 'custom'
  600. end
  601. local digest = ''
  602. if settings.enable_digest then
  603. digest = task:get_digest()
  604. end
  605. local subject = ''
  606. if settings.insert_subject then
  607. subject = lua_util.maybe_obfuscate_string(task:get_subject() or '', settings, 'subject')
  608. end
  609. local scan_real = task:get_scan_time()
  610. scan_real = math.floor(scan_real * 1000)
  611. if scan_real < 0 then
  612. rspamd_logger.messagex(task,
  613. 'clock skew detected for message: %s ms real scan time (reset to 0)',
  614. scan_real)
  615. scan_real = 0
  616. end
  617. local auth_user = task:get_user() or ''
  618. local settings_id = task:get_settings_id()
  619. if settings_id then
  620. -- Convert to string
  621. settings_id = lua_settings.settings_by_id(settings_id)
  622. if settings_id then
  623. settings_id = settings_id.name
  624. end
  625. end
  626. if not settings_id then
  627. settings_id = ''
  628. end
  629. local row = {
  630. today(timestamp),
  631. timestamp,
  632. from_domain,
  633. mime_domain,
  634. ip_str,
  635. helo,
  636. score,
  637. nrcpts,
  638. task:get_size(),
  639. fields.whitelist,
  640. fields.bayes,
  641. fields.fuzzy,
  642. fields.ann,
  643. fields.dkim,
  644. fields.dmarc,
  645. nurls,
  646. action,
  647. from_user,
  648. mime_user,
  649. rcpt_user,
  650. rcpt_domain,
  651. smtp_recipients,
  652. list_id,
  653. subject,
  654. digest,
  655. fields.spf,
  656. mime_recipients,
  657. message_id,
  658. scan_real,
  659. custom_action,
  660. auth_user,
  661. settings_id
  662. }
  663. -- Attachments step
  664. local attachments_fnames = {}
  665. local attachments_ctypes = {}
  666. local attachments_lengths = {}
  667. local attachments_digests = {}
  668. for _, part in ipairs(task:get_parts()) do
  669. if part:is_attachment() then
  670. table.insert(attachments_fnames, part:get_filename() or '')
  671. local mime_type, mime_subtype = part:get_type()
  672. table.insert(attachments_ctypes, string.format("%s/%s", mime_type, mime_subtype))
  673. table.insert(attachments_lengths, part:get_length())
  674. table.insert(attachments_digests, string.sub(part:get_digest(), 1, 16))
  675. end
  676. end
  677. if #attachments_fnames > 0 then
  678. table.insert(row, attachments_fnames)
  679. table.insert(row, attachments_ctypes)
  680. table.insert(row, attachments_lengths)
  681. table.insert(row, attachments_digests)
  682. else
  683. table.insert(row, {})
  684. table.insert(row, {})
  685. table.insert(row, {})
  686. table.insert(row, {})
  687. end
  688. -- Urls step
  689. local urls_urls = {}
  690. local urls_tlds = {}
  691. local urls_flags = {}
  692. if settings.full_urls then
  693. for i,u in ipairs(task_urls) do
  694. urls_urls[i] = u:get_text()
  695. urls_tlds[i] = u:get_tld() or u:get_host()
  696. urls_flags[i] = u:get_flags_num()
  697. end
  698. else
  699. -- We need to store unique
  700. local mt = {
  701. ord_tbl = {}, -- ordered list of urls
  702. idx_tbl = {}, -- indexed by host + flags, reference to an index in ord_tbl
  703. __newindex = function(t, k, v)
  704. local idx = getmetatable(t).idx_tbl
  705. local ord = getmetatable(t).ord_tbl
  706. local key = k:get_host() .. tostring(k:get_flags_num())
  707. if idx[key] then
  708. ord[idx[key]] = v -- replace
  709. else
  710. ord[#ord + 1] = v
  711. idx[key] = #ord
  712. end
  713. end,
  714. __index = function(t, k)
  715. local ord = getmetatable(t).ord_tbl
  716. if type(k) == 'number' then
  717. return ord[k]
  718. else
  719. local idx = getmetatable(t).idx_tbl
  720. local key = k:get_host() .. tostring(k:get_flags_num())
  721. if idx[key] then
  722. return ord[idx[key]]
  723. end
  724. end
  725. end,
  726. }
  727. -- Extra index needed for making this unique
  728. local urls_idx = {}
  729. setmetatable(urls_idx, mt)
  730. for _,u in ipairs(task_urls) do
  731. if not urls_idx[u] then
  732. urls_idx[u] = u
  733. urls_urls[#urls_urls + 1] = u:get_host()
  734. urls_tlds[#urls_tlds + 1] = u:get_tld() or u:get_host()
  735. urls_flags[#urls_flags + 1] = u:get_flags_num()
  736. end
  737. end
  738. end
  739. -- Get tlds
  740. table.insert(row, urls_tlds)
  741. -- Get hosts/full urls
  742. table.insert(row, urls_urls)
  743. -- Numeric flags
  744. table.insert(row, urls_flags)
  745. -- Emails step
  746. if task:has_urls(true) then
  747. local emails = task:get_emails() or {}
  748. local emails_formatted = {}
  749. for i,u in ipairs(emails) do
  750. emails_formatted[i] = string.format('%s@%s', u:get_user(), u:get_host())
  751. end
  752. table.insert(row, emails_formatted)
  753. else
  754. table.insert(row, {})
  755. end
  756. -- ASN information
  757. local asn, country, ipnet = 0, '--', '--'
  758. local pool = task:get_mempool()
  759. ret = pool:get_variable("asn")
  760. if ret then
  761. asn = ret
  762. end
  763. ret = pool:get_variable("country")
  764. if ret then
  765. country = ret:sub(1, 2)
  766. end
  767. ret = pool:get_variable("ipnet")
  768. if ret then
  769. ipnet = ret
  770. end
  771. table.insert(row, asn)
  772. table.insert(row, country)
  773. table.insert(row, ipnet)
  774. -- Symbols info
  775. if settings.enable_symbols then
  776. local symbols = task:get_symbols_all()
  777. local syms_tab = {}
  778. local scores_tab = {}
  779. local options_tab = {}
  780. for _,s in ipairs(symbols) do
  781. table.insert(syms_tab, s.name or '')
  782. table.insert(scores_tab, s.score)
  783. if s.options then
  784. table.insert(options_tab, table.concat(s.options, ','))
  785. else
  786. table.insert(options_tab, '');
  787. end
  788. end
  789. table.insert(row, syms_tab)
  790. table.insert(row, scores_tab)
  791. table.insert(row, options_tab)
  792. -- Groups data
  793. local groups = task:get_groups()
  794. local groups_tab = {}
  795. local gr_scores_tab = {}
  796. for gr,sc in pairs(groups) do
  797. table.insert(groups_tab, gr)
  798. table.insert(gr_scores_tab, sc)
  799. end
  800. table.insert(row, groups_tab)
  801. table.insert(row, gr_scores_tab)
  802. end
  803. -- Extra columns
  804. if #settings.extra_columns > 0 then
  805. for _,col in ipairs(settings.extra_columns) do
  806. local elts = col.real_selector(task)
  807. if elts then
  808. table.insert(row, elts)
  809. else
  810. table.insert(row, col.default_value)
  811. end
  812. end
  813. end
  814. -- Custom data
  815. for k,rule in pairs(settings.custom_rules) do
  816. if not custom_rows[k] then custom_rows[k] = {} end
  817. table.insert(custom_rows[k], lua_clickhouse.row_to_tsv(rule.get_row(task)))
  818. end
  819. local tsv_row = lua_clickhouse.row_to_tsv(row)
  820. used_memory = used_memory + #tsv_row
  821. data_rows[#data_rows + 1] = tsv_row
  822. nrows = nrows + 1
  823. lua_util.debugm(N, task,
  824. "add clickhouse row %s / %s; used memory: %s / %s",
  825. nrows, settings.limits.max_rows,
  826. used_memory, settings.limits.max_memory)
  827. end
  828. local function do_remove_partition(ev_base, cfg, table_name, partition)
  829. lua_util.debugm(N, rspamd_config, "removing partition %s.%s", table_name, partition)
  830. local upstream = settings.upstream:get_upstream_round_robin()
  831. local remove_partition_sql = "ALTER TABLE ${table_name} ${remove_method} PARTITION '${partition}'"
  832. local remove_method = (settings.retention.method == 'drop') and 'DROP' or 'DETACH'
  833. local sql_params = {
  834. ['table_name'] = table_name,
  835. ['remove_method'] = remove_method,
  836. ['partition'] = partition
  837. }
  838. local sql = lua_util.template(remove_partition_sql, sql_params)
  839. local ch_params = {
  840. body = sql,
  841. ev_base = ev_base,
  842. config = cfg,
  843. }
  844. local err, _ = lua_clickhouse.generic_sync(upstream, settings, ch_params, sql)
  845. if err then
  846. rspamd_logger.errx(rspamd_config,
  847. "cannot detach partition %s:%s from server %s: %s",
  848. table_name, partition,
  849. settings['server'], err)
  850. return
  851. end
  852. rspamd_logger.infox(rspamd_config,
  853. 'detached partition %s:%s on server %s', table_name, partition,
  854. settings['server'])
  855. end
  856. --[[
  857. nil - file is not writable, do not perform removal
  858. 0 - it's time to perform removal
  859. <int> - how many seconds wait until next run
  860. ]]
  861. local function get_last_removal_ago()
  862. local ts_file = string.format('%s/%s', rspamd_paths['DBDIR'], 'clickhouse_retention_run')
  863. local last_ts
  864. local current_ts = os.time()
  865. local function write_ts_to_file()
  866. local write_file, err = io.open(ts_file, 'w')
  867. if err then
  868. rspamd_logger.errx(rspamd_config, 'Failed to open %s, will not perform retention: %s', ts_file, err)
  869. return nil
  870. end
  871. local res
  872. res, err = write_file:write(tostring(current_ts))
  873. if err or res == nil then
  874. write_file:close()
  875. rspamd_logger.errx(rspamd_config, 'Failed to write %s, will not perform retention: %s', ts_file, err)
  876. return nil
  877. end
  878. write_file:close()
  879. return true
  880. end
  881. local f, err = io.open(ts_file, 'r')
  882. if err then
  883. lua_util.debugm(N, rspamd_config, 'Failed to open %s: %s', ts_file, err)
  884. else
  885. last_ts = tonumber(f:read('*number'))
  886. f:close()
  887. end
  888. if last_ts == nil or (last_ts + settings.retention.period) <= current_ts then
  889. return write_ts_to_file() and 0
  890. end
  891. if last_ts > current_ts then
  892. -- Clock skew detected, overwrite last_ts with current_ts and wait for the next
  893. -- retention period
  894. rspamd_logger.errx(rspamd_config, 'Last collection time is in future: %s; overwrite it with %s in %s',
  895. last_ts, current_ts, ts_file)
  896. return write_ts_to_file() and -1
  897. end
  898. return (last_ts + settings.retention.period) - current_ts
  899. end
  900. local function clickhouse_maybe_send_data_periodic(cfg, ev_base, now)
  901. local need_collect = false
  902. local reason
  903. if nrows == 0 then
  904. lua_util.debugm(N, cfg, "no need to send data, as there are no rows to collect")
  905. return settings.check_timeout
  906. end
  907. if final_call then
  908. lua_util.debugm(N, cfg, "no need to send data, final call has been issued")
  909. return 0
  910. end
  911. if settings.limits.max_rows > 0 then
  912. if nrows > settings.limits.max_rows then
  913. need_collect = true
  914. reason = string.format('limit of rows has been reached: %d', nrows)
  915. end
  916. end
  917. if last_collection > 0 and settings.limits.max_interval > 0 then
  918. if now - last_collection > settings.limits.max_interval then
  919. need_collect = true
  920. reason = string.format('limit of time since last collection has been reached: %d seconds passed ' ..
  921. '(%d seconds trigger)',
  922. (now - last_collection), settings.limits.max_interval)
  923. end
  924. end
  925. if settings.limits.max_memory > 0 then
  926. if used_memory >= settings.limits.max_memory then
  927. need_collect = true
  928. reason = string.format('limit of memory has been reached: %d bytes used',
  929. used_memory)
  930. end
  931. end
  932. if last_collection == 0 then
  933. last_collection = now
  934. end
  935. if need_collect then
  936. -- Do it atomic
  937. local saved_rows = data_rows
  938. local saved_custom = custom_rows
  939. nrows = 0
  940. last_collection = now
  941. used_memory = 0
  942. data_rows = {}
  943. custom_rows = {}
  944. clickhouse_send_data(nil, ev_base, reason, saved_rows, saved_custom)
  945. if settings.collect_garbage then
  946. collectgarbage()
  947. end
  948. end
  949. return settings.check_timeout
  950. end
  951. local function clickhouse_remove_old_partitions(cfg, ev_base)
  952. local last_time_ago = get_last_removal_ago()
  953. if last_time_ago == nil then
  954. rspamd_logger.errx(rspamd_config, "Failed to get last run time. Disabling retention")
  955. return false
  956. elseif last_time_ago ~= 0 then
  957. return last_time_ago
  958. end
  959. local upstream = settings.upstream:get_upstream_round_robin()
  960. local partition_to_remove_sql = "SELECT partition, table " ..
  961. "FROM system.parts WHERE table IN ('${tables}') " ..
  962. "GROUP BY partition, table " ..
  963. "HAVING max(max_date) < toDate(now() - interval ${month} month)"
  964. local table_names = {'rspamd'}
  965. local tables = table.concat(table_names, "', '")
  966. local sql_params = {
  967. tables = tables,
  968. month = settings.retention.period_months,
  969. }
  970. local sql = lua_util.template(partition_to_remove_sql, sql_params)
  971. local ch_params = {
  972. ev_base = ev_base,
  973. config = cfg,
  974. }
  975. local err, rows = lua_clickhouse.select_sync(upstream, settings, ch_params, sql)
  976. if err then
  977. rspamd_logger.errx(rspamd_config,
  978. "cannot send data to clickhouse server %s: %s",
  979. settings['server'], err)
  980. else
  981. fun.each(function(row)
  982. do_remove_partition(ev_base, cfg, row.table, row.partition)
  983. end, rows)
  984. end
  985. -- settings.retention.period is added on initialisation, see below
  986. return settings.retention.period
  987. end
  988. local function upload_clickhouse_schema(upstream, ev_base, cfg, initial)
  989. local ch_params = {
  990. ev_base = ev_base,
  991. config = cfg,
  992. }
  993. local errored = false
  994. -- Upload a single element of the schema
  995. local function upload_schema_elt(v)
  996. if errored then
  997. rspamd_logger.errx(rspamd_config, "cannot upload schema '%s' on clickhouse server %s: due to previous errors",
  998. v, upstream:get_addr():to_string(true))
  999. return
  1000. end
  1001. local sql = v
  1002. local err, reply = lua_clickhouse.generic_sync(upstream, settings, ch_params, sql)
  1003. if err then
  1004. rspamd_logger.errx(rspamd_config, "cannot upload schema '%s' on clickhouse server %s: %s",
  1005. sql, upstream:get_addr():to_string(true), err)
  1006. errored = true
  1007. return
  1008. end
  1009. rspamd_logger.debugm(N, rspamd_config, 'uploaded clickhouse schema element %s to %s: %s',
  1010. v, upstream:get_addr():to_string(true), reply)
  1011. end
  1012. -- Process element and return nil if statement should be skipped
  1013. local function preprocess_schema_elt(v)
  1014. if type(v) == 'string' then
  1015. return lua_util.template(v, {SCHEMA_VERSION = tostring(schema_version)})
  1016. elseif type(v) == 'table' then
  1017. -- Pair of statement + boolean
  1018. if initial == v[2] then
  1019. return lua_util.template(v[1], {SCHEMA_VERSION = tostring(schema_version)})
  1020. else
  1021. rspamd_logger.debugm(N, rspamd_config, 'skip clickhouse schema element %s: schema already exists',
  1022. v)
  1023. end
  1024. end
  1025. return nil
  1026. end
  1027. -- Apply schema elements sequentially, users additions are concatenated to the tail
  1028. fun.each(upload_schema_elt,
  1029. -- Also template schema version
  1030. fun.filter(function(v) return v ~= nil end,
  1031. fun.map(preprocess_schema_elt,
  1032. fun.chain(clickhouse_schema, settings.schema_additions)
  1033. )
  1034. )
  1035. )
  1036. end
  1037. local function maybe_apply_migrations(upstream, ev_base, cfg, version)
  1038. local ch_params = {
  1039. ev_base = ev_base,
  1040. config = cfg,
  1041. }
  1042. -- Apply migrations sequentially
  1043. local function migration_recursor(i)
  1044. if i < schema_version then
  1045. if migrations[i] then
  1046. -- We also need to apply statements sequentially
  1047. local function sql_recursor(j)
  1048. if migrations[i][j] then
  1049. local sql = migrations[i][j]
  1050. local ret = lua_clickhouse.generic(upstream, settings, ch_params, sql,
  1051. function(_, _)
  1052. rspamd_logger.infox(rspamd_config,
  1053. 'applied migration to version %s from version %s: %s',
  1054. i + 1, version, sql:gsub('[\n%s]+', ' '))
  1055. if j == #migrations[i] then
  1056. -- Go to the next migration
  1057. migration_recursor(i + 1)
  1058. else
  1059. -- Apply the next statement
  1060. sql_recursor(j + 1)
  1061. end
  1062. end ,
  1063. function(_, err)
  1064. rspamd_logger.errx(rspamd_config,
  1065. "cannot apply migration %s: '%s' on clickhouse server %s: %s",
  1066. i, sql, upstream:get_addr():to_string(true), err)
  1067. end)
  1068. if not ret then
  1069. rspamd_logger.errx(rspamd_config,
  1070. "cannot apply migration %s: '%s' on clickhouse server %s: cannot make request",
  1071. i, sql, upstream:get_addr():to_string(true))
  1072. end
  1073. end
  1074. end
  1075. sql_recursor(1)
  1076. else
  1077. -- Try another migration
  1078. migration_recursor(i + 1)
  1079. end
  1080. end
  1081. end
  1082. migration_recursor(version)
  1083. end
  1084. local function add_extra_columns(upstream, ev_base, cfg)
  1085. local ch_params = {
  1086. ev_base = ev_base,
  1087. config = cfg,
  1088. }
  1089. -- Apply migrations sequentially
  1090. local function columns_recursor(i)
  1091. if i <= #settings.extra_columns then
  1092. local col = settings.extra_columns[i]
  1093. local prev_column
  1094. if i == 1 then
  1095. prev_column = 'MIMERcpt'
  1096. else
  1097. prev_column = settings.extra_columns[i - 1].name
  1098. end
  1099. local sql = string.format('ALTER TABLE rspamd ADD COLUMN IF NOT EXISTS `%s` %s AFTER `%s`',
  1100. col.name, col.type, prev_column)
  1101. if col.comment then
  1102. sql = sql .. string.format(", COMMENT COLUMN IF EXISTS `%s` '%s'", col.name, col.comment)
  1103. end
  1104. local ret = lua_clickhouse.generic(upstream, settings, ch_params, sql,
  1105. function(_, _)
  1106. rspamd_logger.infox(rspamd_config,
  1107. 'added extra column %s (%s) after %s',
  1108. col.name, col.type, prev_column)
  1109. -- Apply the next statement
  1110. columns_recursor(i + 1)
  1111. end ,
  1112. function(_, err)
  1113. rspamd_logger.errx(rspamd_config,
  1114. "cannot apply add column alter %s: '%s' on clickhouse server %s: %s",
  1115. i, sql, upstream:get_addr():to_string(true), err)
  1116. end)
  1117. if not ret then
  1118. rspamd_logger.errx(rspamd_config,
  1119. "cannot apply add column alter %s: '%s' on clickhouse server %s: cannot make request",
  1120. i, sql, upstream:get_addr():to_string(true))
  1121. end
  1122. end
  1123. end
  1124. columns_recursor(1)
  1125. end
  1126. local function check_rspamd_table(upstream, ev_base, cfg)
  1127. local ch_params = {
  1128. ev_base = ev_base,
  1129. config = cfg,
  1130. }
  1131. local sql = [[EXISTS TABLE rspamd]]
  1132. local err, rows = lua_clickhouse.select_sync(upstream, settings, ch_params, sql)
  1133. if err then
  1134. rspamd_logger.errx(rspamd_config, "cannot check rspamd table in clickhouse server %s: %s",
  1135. upstream:get_addr():to_string(true), err)
  1136. return
  1137. end
  1138. if rows[1] and rows[1].result then
  1139. if tonumber(rows[1].result) == 1 then
  1140. -- Apply migration
  1141. upload_clickhouse_schema(upstream, ev_base, cfg, false)
  1142. rspamd_logger.infox(rspamd_config, 'table rspamd exists, check if we need to apply migrations')
  1143. maybe_apply_migrations(upstream, ev_base, cfg, 1)
  1144. else
  1145. -- Upload schema
  1146. rspamd_logger.infox(rspamd_config, 'table rspamd does not exists, upload full schema')
  1147. upload_clickhouse_schema(upstream, ev_base, cfg, true)
  1148. end
  1149. else
  1150. rspamd_logger.errx(rspamd_config,
  1151. "unexpected reply on EXISTS command from server %s: %s",
  1152. upstream:get_addr():to_string(true), rows)
  1153. end
  1154. end
  1155. local function check_clickhouse_upstream(upstream, ev_base, cfg)
  1156. local ch_params = {
  1157. ev_base = ev_base,
  1158. config = cfg,
  1159. }
  1160. -- If we have some custom rules, we just send its schema to the upstream
  1161. for k,rule in pairs(settings.custom_rules) do
  1162. if rule.schema then
  1163. local sql = lua_util.template(rule.schema, settings)
  1164. local err, _ = lua_clickhouse.generic_sync(upstream, settings, ch_params, sql)
  1165. if err then
  1166. rspamd_logger.errx(rspamd_config, 'cannot send custom schema %s to clickhouse server %s: ' ..
  1167. 'cannot make request (%s)',
  1168. k, upstream:get_addr():to_string(true), err)
  1169. end
  1170. end
  1171. end
  1172. -- Now check the main schema and apply migrations if needed
  1173. local sql = [[SELECT MAX(Version) as v FROM rspamd_version]]
  1174. local err, rows = lua_clickhouse.select_sync(upstream, settings, ch_params, sql)
  1175. if err then
  1176. if rows and rows.code == 404 then
  1177. rspamd_logger.infox(rspamd_config,
  1178. 'table rspamd_version does not exist, check rspamd table')
  1179. check_rspamd_table(upstream, ev_base, cfg)
  1180. else
  1181. rspamd_logger.errx(rspamd_config,
  1182. "cannot get version on clickhouse server %s: %s",
  1183. upstream:get_addr():to_string(true), err)
  1184. end
  1185. else
  1186. upload_clickhouse_schema(upstream, ev_base, cfg, false)
  1187. local version = tonumber(rows[1].v)
  1188. maybe_apply_migrations(upstream, ev_base, cfg, version)
  1189. end
  1190. if #settings.extra_columns > 0 then
  1191. add_extra_columns(upstream, ev_base, cfg)
  1192. end
  1193. end
  1194. local opts = rspamd_config:get_all_opt('clickhouse')
  1195. if opts then
  1196. -- Legacy `limit` options
  1197. if opts.limit and not opts.limits then
  1198. settings.limits.max_rows = opts.limit
  1199. end
  1200. for k,v in pairs(opts) do
  1201. if k == 'custom_rules' then
  1202. if not v[1] then
  1203. v = {v}
  1204. end
  1205. for i,rule in ipairs(v) do
  1206. if rule.schema and rule.first_row and rule.get_row then
  1207. local first_row, get_row
  1208. local loadstring = loadstring or load
  1209. local ret, res_or_err = pcall(loadstring(rule.first_row))
  1210. if not ret or type(res_or_err) ~= 'function' then
  1211. rspamd_logger.errx(rspamd_config, 'invalid first_row (%s) - must be a function',
  1212. res_or_err)
  1213. else
  1214. first_row = res_or_err
  1215. end
  1216. ret, res_or_err = pcall(loadstring(rule.get_row))
  1217. if not ret or type(res_or_err) ~= 'function' then
  1218. rspamd_logger.errx(rspamd_config,
  1219. 'invalid get_row (%s) - must be a function',
  1220. res_or_err)
  1221. else
  1222. get_row = res_or_err
  1223. end
  1224. if first_row and get_row then
  1225. local name = rule.name or tostring(i)
  1226. settings.custom_rules[name] = {
  1227. schema = rule.schema,
  1228. first_row = first_row,
  1229. get_row = get_row,
  1230. }
  1231. end
  1232. else
  1233. rspamd_logger.errx(rspamd_config, 'custom rule has no required attributes: schema, first_row and get_row')
  1234. end
  1235. end
  1236. else
  1237. settings[k] = lua_util.deepcopy(v)
  1238. end
  1239. end
  1240. if not settings['server'] and not settings['servers'] then
  1241. rspamd_logger.infox(rspamd_config, 'no servers are specified, disabling module')
  1242. lua_util.disable_module(N, "config")
  1243. else
  1244. local lua_maps = require "lua_maps"
  1245. settings['from_map'] = lua_maps.map_add('clickhouse', 'from_tables',
  1246. 'regexp', 'clickhouse specific domains')
  1247. settings.upstream = upstream_list.create(rspamd_config,
  1248. settings['server'] or settings['servers'], 8123)
  1249. if not settings.upstream then
  1250. rspamd_logger.errx(rspamd_config, 'cannot parse clickhouse address: %s',
  1251. settings['server'] or settings['servers'])
  1252. lua_util.disable_module(N, "config")
  1253. return
  1254. end
  1255. if settings.exceptions then
  1256. local maps_expressions = require "lua_maps_expressions"
  1257. settings.exceptions = maps_expressions.create(rspamd_config,
  1258. settings.exceptions, N)
  1259. end
  1260. if settings.extra_columns then
  1261. -- Check sanity and create selector closures
  1262. local lua_selectors = require "lua_selectors"
  1263. local columns_transformed = {}
  1264. local need_sort = false
  1265. -- Select traverse function depending on what we have
  1266. local iter_func = settings.extra_columns[1] and ipairs or pairs
  1267. for col_name,col_data in iter_func(settings.extra_columns) do
  1268. -- Array based extra columns
  1269. if col_data.name then col_name = col_data.name end
  1270. if not col_data.selector or not col_data.type then
  1271. rspamd_logger.errx(rspamd_config, 'cannot add clickhouse extra row %s: no type or no selector',
  1272. col_name)
  1273. else
  1274. local is_array = false
  1275. if col_data.type:lower():match('^array') then
  1276. is_array = true
  1277. end
  1278. local selector = lua_selectors.create_selector_closure(rspamd_config,
  1279. col_data.selector, col_data.delimiter or '', is_array)
  1280. if not selector then
  1281. rspamd_logger.errx(rspamd_config, 'cannot add clickhouse extra row %s: bad selector: %s',
  1282. col_name, col_data.selector)
  1283. else
  1284. if not col_data.default_value then
  1285. if is_array then
  1286. col_data.default_value = {}
  1287. else
  1288. col_data.default_value = ''
  1289. end
  1290. end
  1291. col_data.real_selector = selector
  1292. if not col_data.name then
  1293. col_data.name = col_name
  1294. need_sort = true
  1295. end
  1296. table.insert(columns_transformed, col_data)
  1297. end
  1298. end
  1299. end
  1300. -- Convert extra columns from a map to an array sorted by column name to
  1301. -- preserve strict order when doing altering
  1302. if need_sort then
  1303. rspamd_logger.infox(rspamd_config, 'sort extra columns as they are not configured as an array')
  1304. table.sort(columns_transformed, function(c1, c2) return c1.name < c2.name end)
  1305. end
  1306. settings.extra_columns = columns_transformed
  1307. end
  1308. rspamd_config:register_symbol({
  1309. name = 'CLICKHOUSE_COLLECT',
  1310. type = 'idempotent',
  1311. callback = clickhouse_collect,
  1312. flags = 'empty,explicit_disable,ignore_passthrough',
  1313. augmentations = {string.format("timeout=%f", settings.timeout)},
  1314. })
  1315. rspamd_config:register_finish_script(function(task)
  1316. if nrows > 0 then
  1317. final_call = true
  1318. local saved_rows = data_rows
  1319. local saved_custom = custom_rows
  1320. nrows = 0
  1321. data_rows = {}
  1322. used_memory = 0
  1323. custom_rows = {}
  1324. clickhouse_send_data(task, nil, 'final collection',
  1325. saved_rows, saved_custom)
  1326. if settings.collect_garbage then
  1327. collectgarbage()
  1328. end
  1329. end
  1330. end)
  1331. -- Create tables on load
  1332. rspamd_config:add_on_load(function(cfg, ev_base, worker)
  1333. if worker:is_scanner() then
  1334. rspamd_config:add_periodic(ev_base, 0,
  1335. clickhouse_maybe_send_data_periodic, true)
  1336. end
  1337. if worker:is_primary_controller() then
  1338. local upstreams = settings.upstream:all_upstreams()
  1339. for _,up in ipairs(upstreams) do
  1340. check_clickhouse_upstream(up, ev_base, cfg)
  1341. end
  1342. if settings.retention.enable and settings.retention.method ~= 'drop' and
  1343. settings.retention.method ~= 'detach' then
  1344. rspamd_logger.errx(rspamd_config,
  1345. "retention.method should be either 'drop' or 'detach' (now: %s). Disabling retention",
  1346. settings.retention.method)
  1347. settings.retention.enable = false
  1348. end
  1349. if settings.retention.enable and settings.retention.period_months < 1 or
  1350. settings.retention.period_months > 1000 then
  1351. rspamd_logger.errx(rspamd_config,
  1352. "please, set retention.period_months between 1 and 1000 (now: %s). Disabling retention",
  1353. settings.retention.period_months)
  1354. settings.retention.enable = false
  1355. end
  1356. local period = lua_util.parse_time_interval(settings.retention.run_every)
  1357. if settings.retention.enable and period == nil then
  1358. rspamd_logger.errx(rspamd_config, "invalid value for retention.run_every (%s). Disabling retention",
  1359. settings.retention.run_every)
  1360. settings.retention.enable = false
  1361. end
  1362. if settings.retention.enable then
  1363. settings.retention.period = period
  1364. rspamd_logger.infox(rspamd_config,
  1365. "retention will be performed each %s seconds for %s month with method %s",
  1366. period, settings.retention.period_months, settings.retention.method)
  1367. rspamd_config:add_periodic(ev_base, 0, clickhouse_remove_old_partitions, false)
  1368. end
  1369. end
  1370. end)
  1371. end
  1372. end