You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

clickhouse.lua 50KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556
  1. --[[
  2. Copyright (c) 2022, Vsevolod Stakhov <vsevolod@rspamd.com>
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ]]--
  13. local rspamd_logger = require 'rspamd_logger'
  14. local upstream_list = require "rspamd_upstream_list"
  15. local lua_util = require "lua_util"
  16. local lua_clickhouse = require "lua_clickhouse"
  17. local lua_settings = require "lua_settings"
  18. local fun = require "fun"
  -- Module name
  local N = "clickhouse"

  -- Stop loading here when the `confighelp` global is set: nothing below is
  -- needed in that mode.
  if confighelp then
    return
  end

  -- Buffers for rows that have been collected but not yet sent
  local data_rows, custom_rows = {}, {}
  -- Collection counters, all starting from zero
  local nrows, used_memory, last_collection = 0, 0, 0
  -- If the final collection has been started
  local final_call = false
  -- Current schema version
  local schema_version = 9
  -- Default module settings.
  -- The `*_symbols` lists name the rspamd symbols that are looked up on a task
  -- to fill the corresponding Is* column (see clickhouse_check_symbol).
  local settings = {
    limits = { -- Collection limits
      max_rows = 1000, -- How many rows are allowed (0 for disable this)
      max_memory = 50 * 1024 * 1024, -- How many memory should be occupied before sending collection
      max_interval = 60, -- Maximum collection interval
    },
    collect_garbage = false, -- Perform GC collection after sending the data
    check_timeout = 10.0, -- Periodic timeout
    timeout = 5.0,
    bayes_spam_symbols = { 'BAYES_SPAM' },
    bayes_ham_symbols = { 'BAYES_HAM' },
    ann_symbols_spam = { 'NEURAL_SPAM' },
    ann_symbols_ham = { 'NEURAL_HAM' },
    fuzzy_symbols = { 'FUZZY_DENIED' },
    whitelist_symbols = { 'WHITELIST_DKIM', 'WHITELIST_SPF_DKIM', 'WHITELIST_DMARC' },
    dkim_allow_symbols = { 'R_DKIM_ALLOW' },
    dkim_reject_symbols = { 'R_DKIM_REJECT' },
    dkim_dnsfail_symbols = { 'R_DKIM_TEMPFAIL', 'R_DKIM_PERMFAIL' },
    dkim_na_symbols = { 'R_DKIM_NA' },
    dmarc_allow_symbols = { 'DMARC_POLICY_ALLOW' },
    dmarc_reject_symbols = { 'DMARC_POLICY_REJECT' },
    dmarc_quarantine_symbols = { 'DMARC_POLICY_QUARANTINE' },
    dmarc_softfail_symbols = { 'DMARC_POLICY_SOFTFAIL' },
    dmarc_na_symbols = { 'DMARC_NA' },
    spf_allow_symbols = { 'R_SPF_ALLOW' },
    spf_reject_symbols = { 'R_SPF_FAIL' },
    spf_dnsfail_symbols = { 'R_SPF_DNSFAIL', 'R_SPF_PERMFAIL' },
    -- This list used to be a copy of `dkim_dnsfail_symbols`, which made the SPF
    -- column read 'neutral' whenever DKIM temp/perm-failed. It has to name SPF
    -- symbols: neutral and softfail results are both stored as 'neutral'.
    spf_neutral_symbols = { 'R_SPF_NEUTRAL', 'R_SPF_SOFTFAIL' },
    spf_na_symbols = { 'R_SPF_NA' },
    stop_symbols = {}, -- a task carrying any of these symbols is not stored
    ipmask = 19, -- mask applied to IPv4 client addresses before storing
    ipmask6 = 48, -- mask applied to non-IPv4 client addresses before storing
    full_urls = false,
    from_tables = nil,
    enable_symbols = false, -- also insert the Symbols.* and Groups.* columns
    database = 'default',
    use_https = false,
    use_gzip = true,
    allow_local = false, -- when false, rspamc/controller scans are not stored
    insert_subject = false,
    subject_privacy = false, -- subject privacy is off
    subject_privacy_alg = 'blake2', -- default hash-algorithm to obfuscate subject
    subject_privacy_prefix = 'obf', -- prefix to show it's obfuscated
    subject_privacy_length = 16, -- cut the length of the hash
    schema_additions = {}, -- additional SQL statements to be executed when schema is uploaded
    user = nil,
    password = nil,
    no_ssl_verify = false,
    custom_rules = {},
    enable_digest = false,
    exceptions = nil,
    retention = {
      enable = false,
      method = 'detach',
      period_months = 3,
      run_every = '7d',
    },
    extra_columns = {},
  }
  -- Statements that create the storage from scratch. Each entry is either a
  -- plain SQL string or a `{ sql, flag }` pair.
  local clickhouse_schema
  do
    -- Main wide table that receives one row per stored message
    --- @language SQL
    local rspamd_table_ddl = [[
CREATE TABLE IF NOT EXISTS rspamd
(
Date Date COMMENT 'Date (used for partitioning)',
TS DateTime COMMENT 'Date and time of the request start (UTC)',
From String COMMENT 'Domain part of the return address (RFC5321.MailFrom)',
MimeFrom String COMMENT 'Domain part of the address in From: header (RFC5322.From)',
IP String COMMENT 'SMTP client IP as provided by MTA or from Received: header',
Helo String COMMENT 'Full hostname as sent by the SMTP client (RFC5321.HELO/.EHLO)',
Score Float32 COMMENT 'Message score',
NRcpt UInt8 COMMENT 'Number of envelope recipients (RFC5321.RcptTo)',
Size UInt32 COMMENT 'Message size in bytes',
IsWhitelist Enum8('blacklist' = 0, 'whitelist' = 1, 'unknown' = 2) DEFAULT 'unknown' COMMENT 'Based on symbols configured in `whitelist_symbols` module option',
IsBayes Enum8('ham' = 0, 'spam' = 1, 'unknown' = 2) DEFAULT 'unknown' COMMENT 'Based on symbols configured in `bayes_spam_symbols` and `bayes_ham_symbols` module options',
IsFuzzy Enum8('whitelist' = 0, 'deny' = 1, 'unknown' = 2) DEFAULT 'unknown' COMMENT 'Based on symbols configured in `fuzzy_symbols` module option',
IsFann Enum8('ham' = 0, 'spam' = 1, 'unknown' = 2) DEFAULT 'unknown' COMMENT 'Based on symbols configured in `ann_symbols_spam` and `ann_symbols_ham` module options',
IsDkim Enum8('reject' = 0, 'allow' = 1, 'unknown' = 2, 'dnsfail' = 3, 'na' = 4) DEFAULT 'unknown' COMMENT 'Based on symbols configured in dkim_* module options',
IsDmarc Enum8('reject' = 0, 'allow' = 1, 'unknown' = 2, 'softfail' = 3, 'na' = 4, 'quarantine' = 5) DEFAULT 'unknown' COMMENT 'Based on symbols configured in dmarc_* module options',
IsSpf Enum8('reject' = 0, 'allow' = 1, 'neutral' = 2, 'dnsfail' = 3, 'na' = 4, 'unknown' = 5) DEFAULT 'unknown' COMMENT 'Based on symbols configured in spf_* module options',
NUrls Int32 COMMENT 'Number of URLs and email extracted from the message',
Action Enum8('reject' = 0, 'rewrite subject' = 1, 'add header' = 2, 'greylist' = 3, 'no action' = 4, 'soft reject' = 5, 'custom' = 6) DEFAULT 'no action' COMMENT 'Action returned for the message; if action is not predefined actual action will be in `CustomAction` field',
CustomAction LowCardinality(String) COMMENT 'Action string for custom action',
FromUser String COMMENT 'Local part of the return address (RFC5321.MailFrom)',
MimeUser String COMMENT 'Local part of the address in From: header (RFC5322.From)',
RcptUser String COMMENT '[Deprecated] Local part of the first envelope recipient (RFC5321.RcptTo)',
RcptDomain String COMMENT '[Deprecated] Domain part of the first envelope recipient (RFC5321.RcptTo)',
SMTPRecipients Array(String) COMMENT 'List of envelope recipients (RFC5321.RcptTo)',
MimeRecipients Array(String) COMMENT 'List of recipients from headers (RFC5322.To/.CC/.BCC)',
MessageId String COMMENT 'Message-ID header',
ListId String COMMENT 'List-Id header',
Subject String COMMENT 'Subject header (or hash if `subject_privacy` module option enabled)',
`Attachments.FileName` Array(String) COMMENT 'Attachment name',
`Attachments.ContentType` Array(String) COMMENT 'Attachment Content-Type',
`Attachments.Length` Array(UInt32) COMMENT 'Attachment size in bytes',
`Attachments.Digest` Array(FixedString(16)) COMMENT 'First 16 characters of hash returned by mime_part:get_digest()',
`Urls.Tld` Array(String) COMMENT 'Effective second level domain part of the URL host',
`Urls.Url` Array(String) COMMENT 'Full URL if `full_urls` module option enabled, host part of URL otherwise',
`Urls.Flags` Array(UInt32) COMMENT 'Corresponding url flags, see `enum rspamd_url_flags` in libserver/url.h for details',
Emails Array(String) COMMENT 'List of emails extracted from the message',
ASN UInt32 COMMENT 'BGP AS number for SMTP client IP (returned by asn.rspamd.com or asn6.rspamd.com)',
Country FixedString(2) COMMENT 'Country for SMTP client IP (returned by asn.rspamd.com or asn6.rspamd.com)',
IPNet String,
`Symbols.Names` Array(LowCardinality(String)) COMMENT 'Symbol name',
`Symbols.Scores` Array(Float32) COMMENT 'Symbol score',
`Symbols.Options` Array(String) COMMENT 'Symbol options (comma separated list)',
`Groups.Names` Array(LowCardinality(String)) COMMENT 'Group name',
`Groups.Scores` Array(Float32) COMMENT 'Group score',
ScanTimeReal UInt32 COMMENT 'Request time in milliseconds',
ScanTimeVirtual UInt32 COMMENT 'Deprecated do not use',
AuthUser String COMMENT 'Username for authenticated SMTP client',
SettingsId LowCardinality(String) COMMENT 'ID for the settings profile',
Digest FixedString(32) COMMENT '[Deprecated]',
SMTPFrom ALIAS if(From = '', '', concat(FromUser, '@', From)) COMMENT 'Return address (RFC5321.MailFrom)',
SMTPRcpt ALIAS SMTPRecipients[1] COMMENT 'The first envelope recipient (RFC5321.RcptTo)',
MIMEFrom ALIAS if(MimeFrom = '', '', concat(MimeUser, '@', MimeFrom)) COMMENT 'Address in From: header (RFC5322.From)',
MIMERcpt ALIAS MimeRecipients[1] COMMENT 'The first recipient from headers (RFC5322.To/.CC/.BCC)'
) ENGINE = MergeTree()
PARTITION BY toMonday(Date)
ORDER BY TS
]]

    -- Single-column table that stores the schema version
    --- @language SQL
    local version_table_ddl = [[CREATE TABLE IF NOT EXISTS rspamd_version ( Version UInt32) ENGINE = TinyLog]]

    -- Records the version of a freshly created schema.
    -- NOTE(review): the `true` paired with this statement below presumably marks
    -- it as a template whose ${SCHEMA_VERSION} placeholder must be substituted
    -- before execution - confirm against the code that uploads the schema.
    --- @language SQL
    local version_insert = [[INSERT INTO rspamd_version (Version) Values (${SCHEMA_VERSION})]]

    clickhouse_schema = {
      rspamd_table_ddl,
      version_table_ddl,
      { version_insert, true },
    }
  end
  -- This describes SQL queries to migrate between versions.
  -- migrations[v] lists the statements that upgrade a schema from version v to
  -- version v + 1; the last statement of every step records the new version.
  local migrations
  do
    -- Statement that stores `version` in the rspamd_version table
    local function set_version(version)
      return string.format('INSERT INTO rspamd_version (Version) Values (%d)', version)
    end

    migrations = {
      -- 1 -> 2
      {
        -- Move to a wide fat table
        [[ALTER TABLE rspamd
ADD COLUMN IF NOT EXISTS `Attachments.FileName` Array(String) AFTER ListId,
ADD COLUMN IF NOT EXISTS `Attachments.ContentType` Array(String) AFTER `Attachments.FileName`,
ADD COLUMN IF NOT EXISTS `Attachments.Length` Array(UInt32) AFTER `Attachments.ContentType`,
ADD COLUMN IF NOT EXISTS `Attachments.Digest` Array(FixedString(16)) AFTER `Attachments.Length`,
ADD COLUMN IF NOT EXISTS `Urls.Tld` Array(String) AFTER `Attachments.Digest`,
ADD COLUMN IF NOT EXISTS `Urls.Url` Array(String) AFTER `Urls.Tld`,
ADD COLUMN IF NOT EXISTS Emails Array(String) AFTER `Urls.Url`,
ADD COLUMN IF NOT EXISTS ASN UInt32 AFTER Emails,
ADD COLUMN IF NOT EXISTS Country FixedString(2) AFTER ASN,
ADD COLUMN IF NOT EXISTS IPNet String AFTER Country,
ADD COLUMN IF NOT EXISTS `Symbols.Names` Array(String) AFTER IPNet,
ADD COLUMN IF NOT EXISTS `Symbols.Scores` Array(Float64) AFTER `Symbols.Names`,
ADD COLUMN IF NOT EXISTS `Symbols.Options` Array(String) AFTER `Symbols.Scores`]],
        -- Add explicit version
        [[CREATE TABLE rspamd_version ( Version UInt32) ENGINE = TinyLog]],
        set_version(2),
      },
      -- 2 -> 3
      {
        -- Add `Subject` column
        [[ALTER TABLE rspamd
ADD COLUMN IF NOT EXISTS Subject String AFTER ListId]],
        -- New version
        set_version(3),
      },
      -- 3 -> 4
      {
        [[ALTER TABLE rspamd
ADD COLUMN IF NOT EXISTS IsSpf Enum8('reject' = 0, 'allow' = 1, 'neutral' = 2, 'dnsfail' = 3, 'na' = 4, 'unknown' = 5) DEFAULT 'unknown' AFTER IsDmarc,
MODIFY COLUMN IsDkim Enum8('reject' = 0, 'allow' = 1, 'unknown' = 2, 'dnsfail' = 3, 'na' = 4) DEFAULT 'unknown',
MODIFY COLUMN IsDmarc Enum8('reject' = 0, 'allow' = 1, 'unknown' = 2, 'softfail' = 3, 'na' = 4, 'quarantine' = 5) DEFAULT 'unknown',
ADD COLUMN IF NOT EXISTS MimeRecipients Array(String) AFTER RcptDomain,
ADD COLUMN IF NOT EXISTS MessageId String AFTER MimeRecipients,
ADD COLUMN IF NOT EXISTS ScanTimeReal UInt32 AFTER `Symbols.Options`,
ADD COLUMN IF NOT EXISTS ScanTimeVirtual UInt32 AFTER ScanTimeReal]],
        -- Add aliases
        [[ALTER TABLE rspamd
ADD COLUMN IF NOT EXISTS SMTPFrom ALIAS if(From = '', '', concat(FromUser, '@', From)),
ADD COLUMN IF NOT EXISTS SMTPRcpt ALIAS if(RcptDomain = '', '', concat(RcptUser, '@', RcptDomain)),
ADD COLUMN IF NOT EXISTS MIMEFrom ALIAS if(MimeFrom = '', '', concat(MimeUser, '@', MimeFrom)),
ADD COLUMN IF NOT EXISTS MIMERcpt ALIAS MimeRecipients[1]
]],
        -- New version
        set_version(4),
      },
      -- 4 -> 5
      {
        [[ALTER TABLE rspamd
MODIFY COLUMN Action Enum8('reject' = 0, 'rewrite subject' = 1, 'add header' = 2, 'greylist' = 3, 'no action' = 4, 'soft reject' = 5, 'custom' = 6) DEFAULT 'no action',
ADD COLUMN IF NOT EXISTS CustomAction String AFTER Action
]],
        -- New version
        set_version(5),
      },
      -- 5 -> 6
      {
        [[ALTER TABLE rspamd
ADD COLUMN IF NOT EXISTS AuthUser String AFTER ScanTimeVirtual,
ADD COLUMN IF NOT EXISTS SettingsId LowCardinality(String) AFTER AuthUser
]],
        -- New version
        set_version(6),
      },
      -- 6 -> 7
      {
        -- Add new columns
        [[ALTER TABLE rspamd
ADD COLUMN IF NOT EXISTS Helo String AFTER IP,
ADD COLUMN IF NOT EXISTS SMTPRecipients Array(String) AFTER RcptDomain
]],
        -- Modify SMTPRcpt alias
        [[
ALTER TABLE rspamd
MODIFY COLUMN SMTPRcpt ALIAS SMTPRecipients[1]
]],
        -- New version
        set_version(7),
      },
      -- 7 -> 8
      {
        -- Add new columns
        [[ALTER TABLE rspamd
ADD COLUMN IF NOT EXISTS `Groups.Names` Array(LowCardinality(String)) AFTER `Symbols.Options`,
ADD COLUMN IF NOT EXISTS `Groups.Scores` Array(Float32) AFTER `Groups.Names`
]],
        -- New version
        set_version(8),
      },
      -- 8 -> 9
      {
        -- Add new columns
        [[ALTER TABLE rspamd
ADD COLUMN IF NOT EXISTS `Urls.Flags` Array(UInt32) AFTER `Urls.Url`
]],
        -- New version
        set_version(9),
      },
    }
  end
  -- Set of action names that map directly onto the `Action` enum column; any
  -- other action is stored as 'custom' with its name kept in `CustomAction`.
  local predefined_actions = {}
  for _, action_name in ipairs({
    'reject',
    'rewrite subject',
    'add header',
    'greylist',
    'no action',
    'soft reject',
  }) do
    predefined_actions[action_name] = true
  end
  --- Appends the names of the main `rspamd` table columns to `res`.
  -- The order is significant: clickhouse_send_data joins these names into the
  -- INSERT column list, so they must line up with the values placed in a row.
  -- @param res array that receives the column names (modified in place)
  local function clickhouse_main_row(res)
    local column_names = {
      'Date', 'TS', 'From', 'MimeFrom', 'IP', 'Helo', 'Score', 'NRcpt', 'Size',
      'IsWhitelist', 'IsBayes', 'IsFuzzy', 'IsFann', 'IsDkim', 'IsDmarc',
      'NUrls', 'Action', 'FromUser', 'MimeUser', 'RcptUser', 'RcptDomain',
      'SMTPRecipients', 'ListId', 'Subject', 'Digest',
      -- 1.9.2 +
      'IsSpf', 'MimeRecipients', 'MessageId', 'ScanTimeReal',
      -- 1.9.3 +
      'CustomAction',
      -- 2.0 +
      'AuthUser', 'SettingsId',
    }

    for idx = 1, #column_names do
      table.insert(res, column_names[idx])
    end
  end
  --- Appends the names of the `Attachments.*` columns to `res`.
  -- @param res array that receives the column names (modified in place)
  local function clickhouse_attachments_row(res)
    local column_names = {
      'Attachments.FileName',
      'Attachments.ContentType',
      'Attachments.Length',
      'Attachments.Digest',
    }

    for idx = 1, #column_names do
      table.insert(res, column_names[idx])
    end
  end
  --- Appends the names of the `Urls.*` columns to `res`.
  -- @param res array that receives the column names (modified in place)
  local function clickhouse_urls_row(res)
    local column_names = { 'Urls.Tld', 'Urls.Url', 'Urls.Flags' }

    for idx = 1, #column_names do
      table.insert(res, column_names[idx])
    end
  end
  --- Appends the name of the `Emails` column to `res`.
  -- @param res array that receives the column name (modified in place)
  local function clickhouse_emails_row(res)
    -- Only one column belongs to this group, so no loop is needed
    table.insert(res, 'Emails')
  end
  --- Appends the names of the `Symbols.*` columns to `res`.
  -- @param res array that receives the column names (modified in place)
  local function clickhouse_symbols_row(res)
    local column_names = { 'Symbols.Names', 'Symbols.Scores', 'Symbols.Options' }

    for idx = 1, #column_names do
      table.insert(res, column_names[idx])
    end
  end
  --- Appends the names of the `Groups.*` columns to `res`.
  -- @param res array that receives the column names (modified in place)
  local function clickhouse_groups_row(res)
    local column_names = { 'Groups.Names', 'Groups.Scores' }

    for idx = 1, #column_names do
      table.insert(res, column_names[idx])
    end
  end
  --- Appends the names of the ASN-related columns to `res`.
  -- @param res array that receives the column names (modified in place)
  local function clickhouse_asn_row(res)
    local column_names = { 'ASN', 'Country', 'IPNet' }

    for idx = 1, #column_names do
      table.insert(res, column_names[idx])
    end
  end
  --- Appends the `name` of every entry of `settings.extra_columns` to `res`.
  -- @param res array that receives the column names (modified in place)
  local function clickhouse_extra_columns(res)
    local extra = settings.extra_columns

    for _, column in ipairs(extra) do
      table.insert(res, column.name)
    end
  end
  --- Formats a timestamp as a `YYYY-MM-DD` date.
  -- The leading `!` in the format makes os.date work in UTC instead of the
  -- local timezone.
  -- @param ts Unix timestamp in seconds
  -- @return date string
  local function today(ts)
    local utc_date_format = '!%Y-%m-%d'
    return os.date(utc_date_format, ts)
  end
  --- Sets `fields_table[field_name]` according to the first configured symbol
  -- that is present on the task.
  -- The symbols are taken from `settings[settings_field_name]` and checked in
  -- order; the search stops at the first one the task has.
  -- @param task task object providing has_symbol/get_symbol
  -- @param settings_field_name key in `settings` holding the list of symbol names
  -- @param fields_table table to store the result in
  -- @param field_name key of `fields_table` to set
  -- @param value value stored for a match (with `value_negative` given: only
  --   when the symbol score is above zero)
  -- @param value_negative optional value stored instead when the matched symbol
  --   score is not above zero
  -- @return true if one of the symbols was found, false otherwise
  local function clickhouse_check_symbol(task, settings_field_name, fields_table,
                                         field_name, value, value_negative)
    local candidates = settings[settings_field_name] or {}

    for _, symbol_name in ipairs(candidates) do
      if task:has_symbol(symbol_name) then
        local chosen = value

        if value_negative then
          -- The sign of the first result's score decides which value is stored
          local hit = task:get_symbol(symbol_name)[1]
          if not (hit['score'] > 0) then
            chosen = value_negative
          end
        end

        fields_table[field_name] = chosen
        return true
      end
    end

    return false
  end
  --- Sends collected rows to a ClickHouse upstream chosen round-robin.
  -- The generic rows go to the `rspamd` table; every custom rule's rows are
  -- sent with the query returned by that rule's `first_row()`.
  -- @param task task to bind the request (and logging) to, or nil
  -- @param ev_base event base passed to the client when there is no task
  -- @param why reason for the send, only used in log messages
  -- @param gen_rows rows for the main `rspamd` table
  -- @param cust_rows table mapping a custom rule key to its rows
  local function clickhouse_send_data(task, ev_base, why, gen_rows, cust_rows)
    local log_object = task or rspamd_config
    local upstream = settings.upstream:get_upstream_round_robin()
    local ip_addr = upstream:get_addr():to_string(true)

    -- NOTE(review): cust_rows is walked with pairs() below and indexed by rule
    -- key, so `#cust_rows` may not reflect the number of custom rows - confirm.
    rspamd_logger.infox(log_object, "trying to send %s rows to clickhouse server %s; started as %s",
        #gen_rows + #cust_rows, ip_addr, why)

    -- Issues one INSERT and reports the outcome to the logger and the upstream
    local function send_data(what, tbl, query)
      local how_many = #tbl

      local ch_params
      if task then
        ch_params = { task = task }
      else
        ch_params = { config = rspamd_config, ev_base = ev_base }
      end

      local function on_success(_, _)
        rspamd_logger.messagex(log_object, "sent %s rows of %s to clickhouse server %s; started as %s",
            how_many, what, ip_addr, why)
        upstream:ok()
      end

      local function on_failure(_, err)
        rspamd_logger.errx(log_object, "cannot send %s rows of %s data to clickhouse server %s: %s; started as %s",
            how_many, what, ip_addr, err, why)
        upstream:fail()
      end

      local ret = lua_clickhouse.insert(upstream, settings, ch_params,
          query, tbl, on_success, on_failure)

      if not ret then
        rspamd_logger.errx(log_object, "cannot send %s rows of %s data to clickhouse server %s: %s",
            #tbl, what, ip_addr, 'cannot make HTTP request')
      end
    end

    -- Column groups are appended in the same order as the values in a row
    local appenders = {
      clickhouse_main_row,
      clickhouse_attachments_row,
      clickhouse_urls_row,
      clickhouse_emails_row,
      clickhouse_asn_row,
    }
    if settings.enable_symbols then
      appenders[#appenders + 1] = clickhouse_symbols_row
      appenders[#appenders + 1] = clickhouse_groups_row
    end
    if #settings.extra_columns > 0 then
      appenders[#appenders + 1] = clickhouse_extra_columns
    end

    local fields = {}
    for _, append_columns in ipairs(appenders) do
      append_columns(fields)
    end

    local generic_query = string.format('INSERT INTO rspamd (%s)',
        table.concat(fields, ','))
    send_data('generic data', gen_rows, generic_query)

    for k, crows in pairs(cust_rows) do
      -- NOTE(review): a rule holding exactly one row is skipped by this check;
      -- confirm whether the first element is a header or a real data row.
      if #crows > 1 then
        send_data('custom data (' .. k .. ')', crows,
            settings.custom_rules[k].first_row())
      end
    end
  end
  444. local function clickhouse_collect(task)
  445. if task:has_flag('skip') then
  446. return
  447. end
  448. if not settings.allow_local and lua_util.is_rspamc_or_controller(task) then
  449. return
  450. end
  451. for _, sym in ipairs(settings.stop_symbols) do
  452. if task:has_symbol(sym) then
  453. rspamd_logger.infox(task, 'skip Clickhouse storage for message: symbol %s has fired', sym)
  454. return
  455. end
  456. end
  457. if settings.exceptions then
  458. local excepted, trace = settings.exceptions:process(task)
  459. if excepted then
  460. rspamd_logger.infox(task, 'skipped Clickhouse storage for message: excepted (%s)',
  461. trace)
  462. -- Excepted
  463. return
  464. end
  465. end
  466. local from_domain = ''
  467. local from_user = ''
  468. if task:has_from('smtp') then
  469. local from = task:get_from({ 'smtp', 'orig' })[1]
  470. if from then
  471. from_domain = from['domain']:lower()
  472. from_user = from['user']
  473. end
  474. end
  475. local mime_domain = ''
  476. local mime_user = ''
  477. if task:has_from('mime') then
  478. local from = task:get_from({ 'mime', 'orig' })[1]
  479. if from then
  480. mime_domain = from['domain']:lower()
  481. mime_user = from['user']
  482. end
  483. end
  484. local mime_recipients = {}
  485. if task:has_recipients('mime') then
  486. local recipients = task:get_recipients({ 'mime', 'orig' })
  487. for _, rcpt in ipairs(recipients) do
  488. table.insert(mime_recipients, rcpt['user'] .. '@' .. rcpt['domain']:lower())
  489. end
  490. end
  491. local ip_str = 'undefined'
  492. local ip = task:get_from_ip()
  493. if ip and ip:is_valid() then
  494. local ipnet
  495. if ip:get_version() == 4 then
  496. ipnet = ip:apply_mask(settings['ipmask'])
  497. else
  498. ipnet = ip:apply_mask(settings['ipmask6'])
  499. end
  500. ip_str = ipnet:to_string()
  501. end
  502. local helo = task:get_helo() or ''
  503. local rcpt_user = ''
  504. local rcpt_domain = ''
  505. local smtp_recipients = {}
  506. if task:has_recipients('smtp') then
  507. local recipients = task:get_recipients('smtp')
  508. -- for compatibility with an old table structure
  509. rcpt_user = recipients[1]['user']
  510. rcpt_domain = recipients[1]['domain']:lower()
  511. for _, rcpt in ipairs(recipients) do
  512. table.insert(smtp_recipients, rcpt['user'] .. '@' .. rcpt['domain']:lower())
  513. end
  514. end
  515. local list_id = task:get_header('List-Id') or ''
  516. local message_id = lua_util.maybe_obfuscate_string(task:get_message_id() or '',
  517. settings, 'mid')
  518. local score = task:get_metric_score()[1];
  519. local fields = {
  520. bayes = 'unknown',
  521. fuzzy = 'unknown',
  522. ann = 'unknown',
  523. whitelist = 'unknown',
  524. dkim = 'unknown',
  525. dmarc = 'unknown',
  526. spf = 'unknown',
  527. }
  528. local ret
  529. ret = clickhouse_check_symbol(task, 'bayes_spam_symbols', fields,
  530. 'bayes', 'spam')
  531. if not ret then
  532. clickhouse_check_symbol(task, 'bayes_ham_symbols', fields,
  533. 'bayes', 'ham')
  534. end
  535. clickhouse_check_symbol(task, 'ann_symbols_spam', fields,
  536. 'ann', 'spam')
  537. if not ret then
  538. clickhouse_check_symbol(task, 'ann_symbols_ham', fields,
  539. 'ann', 'ham')
  540. end
  541. clickhouse_check_symbol(task, 'whitelist_symbols', fields,
  542. 'whitelist', 'blacklist', 'whitelist')
  543. clickhouse_check_symbol(task, 'fuzzy_symbols', fields,
  544. 'fuzzy', 'deny')
  545. ret = clickhouse_check_symbol(task, 'dkim_allow_symbols', fields,
  546. 'dkim', 'allow')
  547. if not ret then
  548. ret = clickhouse_check_symbol(task, 'dkim_reject_symbols', fields,
  549. 'dkim', 'reject')
  550. end
  551. if not ret then
  552. ret = clickhouse_check_symbol(task, 'dkim_dnsfail_symbols', fields,
  553. 'dkim', 'dnsfail')
  554. end
  555. if not ret then
  556. clickhouse_check_symbol(task, 'dkim_na_symbols', fields,
  557. 'dkim', 'na')
  558. end
  559. ret = clickhouse_check_symbol(task, 'dmarc_allow_symbols', fields,
  560. 'dmarc', 'allow')
  561. if not ret then
  562. ret = clickhouse_check_symbol(task, 'dmarc_reject_symbols', fields,
  563. 'dmarc', 'reject')
  564. end
  565. if not ret then
  566. ret = clickhouse_check_symbol(task, 'dmarc_quarantine_symbols', fields,
  567. 'dmarc', 'quarantine')
  568. end
  569. if not ret then
  570. ret = clickhouse_check_symbol(task, 'dmarc_softfail_symbols', fields,
  571. 'dmarc', 'softfail')
  572. end
  573. if not ret then
  574. clickhouse_check_symbol(task, 'dmarc_na_symbols', fields,
  575. 'dmarc', 'na')
  576. end
  577. ret = clickhouse_check_symbol(task, 'spf_allow_symbols', fields,
  578. 'spf', 'allow')
  579. if not ret then
  580. ret = clickhouse_check_symbol(task, 'spf_reject_symbols', fields,
  581. 'spf', 'reject')
  582. end
  583. if not ret then
  584. ret = clickhouse_check_symbol(task, 'spf_neutral_symbols', fields,
  585. 'spf', 'neutral')
  586. end
  587. if not ret then
  588. ret = clickhouse_check_symbol(task, 'spf_dnsfail_symbols', fields,
  589. 'spf', 'dnsfail')
  590. end
  591. if not ret then
  592. clickhouse_check_symbol(task, 'spf_na_symbols', fields,
  593. 'spf', 'na')
  594. end
  595. local nrcpts = 0
  596. if task:has_recipients('smtp') then
  597. nrcpts = #task:get_recipients('smtp')
  598. end
  599. local nurls = 0
  600. local task_urls = task:get_urls({
  601. content = true,
  602. images = true,
  603. emails = false,
  604. sort = true,
  605. }) or {}
  606. nurls = #task_urls
  607. local timestamp = math.floor(task:get_date({
  608. format = 'connect',
  609. gmt = true, -- The only sane way to sync stuff with different timezones
  610. }))
  611. local action = task:get_metric_action()
  612. local custom_action = ''
  613. if not predefined_actions[action] then
  614. custom_action = action
  615. action = 'custom'
  616. end
  617. local digest = ''
  618. if settings.enable_digest then
  619. digest = task:get_digest()
  620. end
  621. local subject = ''
  622. if settings.insert_subject then
  623. subject = lua_util.maybe_obfuscate_string(task:get_subject() or '', settings, 'subject')
  624. end
  625. local scan_real = task:get_scan_time()
  626. scan_real = math.floor(scan_real * 1000)
  627. if scan_real < 0 then
  628. rspamd_logger.messagex(task,
  629. 'clock skew detected for message: %s ms real scan time (reset to 0)',
  630. scan_real)
  631. scan_real = 0
  632. end
  633. local auth_user = task:get_user() or ''
  634. local settings_id = task:get_settings_id()
  635. if settings_id then
  636. -- Convert to string
  637. settings_id = lua_settings.settings_by_id(settings_id)
  638. if settings_id then
  639. settings_id = settings_id.name
  640. end
  641. end
  642. if not settings_id then
  643. settings_id = ''
  644. end
  645. local row = {
  646. today(timestamp),
  647. timestamp,
  648. from_domain,
  649. mime_domain,
  650. ip_str,
  651. helo,
  652. score,
  653. nrcpts,
  654. task:get_size(),
  655. fields.whitelist,
  656. fields.bayes,
  657. fields.fuzzy,
  658. fields.ann,
  659. fields.dkim,
  660. fields.dmarc,
  661. nurls,
  662. action,
  663. from_user,
  664. mime_user,
  665. rcpt_user,
  666. rcpt_domain,
  667. smtp_recipients,
  668. list_id,
  669. subject,
  670. digest,
  671. fields.spf,
  672. mime_recipients,
  673. message_id,
  674. scan_real,
  675. custom_action,
  676. auth_user,
  677. settings_id
  678. }
  679. -- Attachments step
  680. local attachments_fnames = {}
  681. local attachments_ctypes = {}
  682. local attachments_lengths = {}
  683. local attachments_digests = {}
  684. for _, part in ipairs(task:get_parts()) do
  685. if part:is_attachment() then
  686. table.insert(attachments_fnames, part:get_filename() or '')
  687. local mime_type, mime_subtype = part:get_type()
  688. table.insert(attachments_ctypes, string.format("%s/%s", mime_type, mime_subtype))
  689. table.insert(attachments_lengths, part:get_length())
  690. table.insert(attachments_digests, string.sub(part:get_digest(), 1, 16))
  691. end
  692. end
  693. if #attachments_fnames > 0 then
  694. table.insert(row, attachments_fnames)
  695. table.insert(row, attachments_ctypes)
  696. table.insert(row, attachments_lengths)
  697. table.insert(row, attachments_digests)
  698. else
  699. table.insert(row, {})
  700. table.insert(row, {})
  701. table.insert(row, {})
  702. table.insert(row, {})
  703. end
  704. -- Urls step
  705. local urls_urls = {}
  706. local urls_tlds = {}
  707. local urls_flags = {}
  708. if settings.full_urls then
  709. for i, u in ipairs(task_urls) do
  710. urls_urls[i] = u:get_text()
  711. urls_tlds[i] = u:get_tld() or u:get_host()
  712. urls_flags[i] = u:get_flags_num()
  713. end
  714. else
  715. -- We need to store unique
  716. local mt = {
  717. ord_tbl = {}, -- ordered list of urls
  718. idx_tbl = {}, -- indexed by host + flags, reference to an index in ord_tbl
  719. __newindex = function(t, k, v)
  720. local idx = getmetatable(t).idx_tbl
  721. local ord = getmetatable(t).ord_tbl
  722. local key = k:get_host() .. tostring(k:get_flags_num())
  723. if idx[key] then
  724. ord[idx[key]] = v -- replace
  725. else
  726. ord[#ord + 1] = v
  727. idx[key] = #ord
  728. end
  729. end,
  730. __index = function(t, k)
  731. local ord = getmetatable(t).ord_tbl
  732. if type(k) == 'number' then
  733. return ord[k]
  734. else
  735. local idx = getmetatable(t).idx_tbl
  736. local key = k:get_host() .. tostring(k:get_flags_num())
  737. if idx[key] then
  738. return ord[idx[key]]
  739. end
  740. end
  741. end,
  742. }
  743. -- Extra index needed for making this unique
  744. local urls_idx = {}
  745. setmetatable(urls_idx, mt)
  746. for _, u in ipairs(task_urls) do
  747. if not urls_idx[u] then
  748. urls_idx[u] = u
  749. urls_urls[#urls_urls + 1] = u:get_host()
  750. urls_tlds[#urls_tlds + 1] = u:get_tld() or u:get_host()
  751. urls_flags[#urls_flags + 1] = u:get_flags_num()
  752. end
  753. end
  754. end
  755. -- Get tlds
  756. table.insert(row, urls_tlds)
  757. -- Get hosts/full urls
  758. table.insert(row, urls_urls)
  759. -- Numeric flags
  760. table.insert(row, urls_flags)
  761. -- Emails step
  762. if task:has_urls(true) then
  763. local emails = task:get_emails() or {}
  764. local emails_formatted = {}
  765. for i, u in ipairs(emails) do
  766. emails_formatted[i] = string.format('%s@%s', u:get_user(), u:get_host())
  767. end
  768. table.insert(row, emails_formatted)
  769. else
  770. table.insert(row, {})
  771. end
  772. -- ASN information
  773. local asn, country, ipnet = 0, '--', '--'
  774. local pool = task:get_mempool()
  775. ret = pool:get_variable("asn")
  776. if ret then
  777. asn = ret
  778. end
  779. ret = pool:get_variable("country")
  780. if ret then
  781. country = ret:sub(1, 2)
  782. end
  783. ret = pool:get_variable("ipnet")
  784. if ret then
  785. ipnet = ret
  786. end
  787. table.insert(row, asn)
  788. table.insert(row, country)
  789. table.insert(row, ipnet)
  790. -- Symbols info
  791. if settings.enable_symbols then
  792. local symbols = task:get_symbols_all()
  793. local syms_tab = {}
  794. local scores_tab = {}
  795. local options_tab = {}
  796. for _, s in ipairs(symbols) do
  797. table.insert(syms_tab, s.name or '')
  798. table.insert(scores_tab, s.score)
  799. if s.options then
  800. table.insert(options_tab, table.concat(s.options, ','))
  801. else
  802. table.insert(options_tab, '');
  803. end
  804. end
  805. table.insert(row, syms_tab)
  806. table.insert(row, scores_tab)
  807. table.insert(row, options_tab)
  808. -- Groups data
  809. local groups = task:get_groups()
  810. local groups_tab = {}
  811. local gr_scores_tab = {}
  812. for gr, sc in pairs(groups) do
  813. table.insert(groups_tab, gr)
  814. table.insert(gr_scores_tab, sc)
  815. end
  816. table.insert(row, groups_tab)
  817. table.insert(row, gr_scores_tab)
  818. end
  819. -- Extra columns
  820. if #settings.extra_columns > 0 then
  821. for _, col in ipairs(settings.extra_columns) do
  822. local elts = col.real_selector(task)
  823. if elts then
  824. table.insert(row, elts)
  825. else
  826. table.insert(row, col.default_value)
  827. end
  828. end
  829. end
  830. -- Custom data
  831. for k, rule in pairs(settings.custom_rules) do
  832. if not custom_rows[k] then
  833. custom_rows[k] = {}
  834. end
  835. table.insert(custom_rows[k], lua_clickhouse.row_to_tsv(rule.get_row(task)))
  836. end
  837. local tsv_row = lua_clickhouse.row_to_tsv(row)
  838. used_memory = used_memory + #tsv_row
  839. data_rows[#data_rows + 1] = tsv_row
  840. nrows = nrows + 1
  841. lua_util.debugm(N, task,
  842. "add clickhouse row %s / %s; used memory: %s / %s",
  843. nrows, settings.limits.max_rows,
  844. used_memory, settings.limits.max_memory)
  845. end
  --- Synchronously remove a single partition of a table from ClickHouse.
  -- Issues `ALTER TABLE ... DROP PARTITION` when `settings.retention.method`
  -- is 'drop' and `ALTER TABLE ... DETACH PARTITION` for any other value.
  -- @param ev_base event base handed over to the ClickHouse request
  -- @param cfg configuration object handed over to the ClickHouse request
  -- @param table_name name of the table that owns the partition
  -- @param partition partition identifier (placed into the statement as is)
  local function do_remove_partition(ev_base, cfg, table_name, partition)
    lua_util.debugm(N, rspamd_config, "removing partition %s.%s", table_name, partition)
    local upstream = settings.upstream:get_upstream_round_robin()
    local remove_partition_sql = "ALTER TABLE ${table_name} ${remove_method} PARTITION '${partition}'"
    local dropping = (settings.retention.method == 'drop')
    local remove_method = dropping and 'DROP' or 'DETACH'
    local sql_params = {
      ['table_name'] = table_name,
      ['remove_method'] = remove_method,
      ['partition'] = partition
    }
    local sql = lua_util.template(remove_partition_sql, sql_params)
    local ch_params = {
      body = sql,
      ev_base = ev_base,
      config = cfg,
    }
    local err, _ = lua_clickhouse.generic_sync(upstream, settings, ch_params, sql)
    if err then
      -- Report the operation that was really attempted (drop or detach)
      rspamd_logger.errx(rspamd_config,
          "cannot %s partition %s:%s from server %s: %s",
          dropping and 'drop' or 'detach',
          table_name, partition,
          settings['server'], err)
      return
    end
    rspamd_logger.infox(rspamd_config,
        '%s partition %s:%s on server %s',
        dropping and 'dropped' or 'detached',
        table_name, partition,
        settings['server'])
  end
  --[[
  Checks the retention timestamp file (`clickhouse_retention_run` in DBDIR)
  and tells the caller when the next removal should happen:
  nil - file is not writable, do not perform removal
  0 - it's time to perform removal
  <int> - how many seconds wait until next run
  ]]
  local function get_last_removal_ago()
    local ts_file = string.format('%s/%s', rspamd_paths['DBDIR'], 'clickhouse_retention_run')
    local last_ts
    local current_ts = os.time()

    -- Stores current_ts in the timestamp file.
    -- Returns true on success and nil (after logging) on any failure.
    local function write_ts_to_file()
      local write_file, err = io.open(ts_file, 'w')
      if not write_file then
        rspamd_logger.errx(rspamd_config, 'Failed to open %s, will not perform retention: %s', ts_file, err)
        return nil
      end

      local res
      res, err = write_file:write(tostring(current_ts))
      -- Close the handle on both the failure and the success path
      write_file:close()
      if err or res == nil then
        rspamd_logger.errx(rspamd_config, 'Failed to write %s, will not perform retention: %s', ts_file, err)
        return nil
      end

      return true
    end

    local f, err = io.open(ts_file, 'r')
    if not f then
      -- A missing file is not an error: last_ts stays nil and removal runs now
      lua_util.debugm(N, rspamd_config, 'Failed to open %s: %s', ts_file, err)
    else
      last_ts = tonumber(f:read('*number'))
      f:close()
    end

    if last_ts == nil or (last_ts + settings.retention.period) <= current_ts then
      return write_ts_to_file() and 0
    end

    if last_ts > current_ts then
      -- Clock skew detected, overwrite last_ts with current_ts and wait for the next
      -- retention period
      rspamd_logger.errx(rspamd_config, 'Last collection time is in future: %s; overwrite it with %s in %s',
          last_ts, current_ts, ts_file)
      -- The stored timestamp is `now` after the rewrite, so the next run is a
      -- full period away; a negative value is not a valid wait interval
      return write_ts_to_file() and settings.retention.period
    end

    return (last_ts + settings.retention.period) - current_ts
  end
  --- Periodic callback deciding whether the accumulated rows must be flushed.
  -- Rows are sent when any enabled limit (`max_rows`, `max_interval`,
  -- `max_memory` from `settings.limits`) is exceeded.
  -- @param cfg configuration object, used for logging
  -- @param ev_base event base handed over to `clickhouse_send_data`
  -- @param now current time, compared against the last collection time
  -- @return `settings.check_timeout`, or 0 once the final call has been issued
  local function clickhouse_maybe_send_data_periodic(cfg, ev_base, now)
    local need_collect = false
    local reason

    if nrows == 0 then
      lua_util.debugm(N, cfg, "no need to send data, as there are no rows to collect")
      return settings.check_timeout
    end

    if final_call then
      lua_util.debugm(N, cfg, "no need to send data, final call has been issued")
      return 0
    end

    if settings.limits.max_rows > 0 then
      if nrows > settings.limits.max_rows then
        need_collect = true
        reason = string.format('limit of rows has been reached: %d', nrows)
      end
    end

    if last_collection > 0 and settings.limits.max_interval > 0 then
      if now - last_collection > settings.limits.max_interval then
        need_collect = true
        -- `%d` rejects numbers without an integer representation on Lua 5.3+,
        -- and the elapsed time is normally fractional, so floor both values
        reason = string.format('limit of time since last collection has been reached: %d seconds passed ' ..
            '(%d seconds trigger)',
            math.floor(now - last_collection), math.floor(settings.limits.max_interval))
      end
    end

    if settings.limits.max_memory > 0 then
      if used_memory >= settings.limits.max_memory then
        need_collect = true
        reason = string.format('limit of memory has been reached: %d bytes used',
            used_memory)
      end
    end

    -- First invocation with pending rows: start measuring the interval from now
    if last_collection == 0 then
      last_collection = now
    end

    if need_collect then
      -- Do it atomic: detach the buffers and reset the counters before sending
      local saved_rows = data_rows
      local saved_custom = custom_rows
      nrows = 0
      last_collection = now
      used_memory = 0
      data_rows = {}
      custom_rows = {}

      clickhouse_send_data(nil, ev_base, reason, saved_rows, saved_custom)

      if settings.collect_garbage then
        collectgarbage()
      end
    end

    return settings.check_timeout
  end
  --- Periodic retention callback: finds partitions of the `rspamd` table that
  -- are older than `settings.retention.period_months` and removes each of them.
  -- @param cfg configuration object handed over to the ClickHouse requests
  -- @param ev_base event base handed over to the ClickHouse requests
  -- @return false when the last run time cannot be obtained, the remaining
  --   wait reported by `get_last_removal_ago` when it is not yet time to run,
  --   otherwise `settings.retention.period`
  local function clickhouse_remove_old_partitions(cfg, ev_base)
    local wait_for = get_last_removal_ago()

    if wait_for == nil then
      rspamd_logger.errx(rspamd_config, "Failed to get last run time. Disabling retention")
      return false
    end

    if wait_for ~= 0 then
      return wait_for
    end

    local upstream = settings.upstream:get_upstream_round_robin()

    local monitored_tables = { 'rspamd' }
    local query = lua_util.template(
        "SELECT partition, table " ..
            "FROM system.parts WHERE table IN ('${tables}') " ..
            "GROUP BY partition, table " ..
            "HAVING max(max_date) < toDate(now() - interval ${month} month)",
        {
          tables = table.concat(monitored_tables, "', '"),
          month = settings.retention.period_months,
        })

    local request_params = { ev_base = ev_base, config = cfg }
    local err, rows = lua_clickhouse.select_sync(upstream, settings, request_params, query)

    if not err then
      -- Every returned row names one outdated partition
      fun.each(function(part)
        do_remove_partition(ev_base, cfg, part.table, part.partition)
      end, rows)
    else
      rspamd_logger.errx(rspamd_config,
          "cannot send data to clickhouse server %s: %s",
          settings['server'], err)
    end

    -- settings.retention.period is added on initialisation, see below
    return settings.retention.period
  end
  --- Upload the schema statements (built-in ones followed by
  -- `settings.schema_additions`) to one upstream, one after another.
  -- After the first failed statement the remaining ones are only reported.
  -- @param upstream upstream to send the statements to
  -- @param ev_base event base handed over to the ClickHouse requests
  -- @param cfg configuration object handed over to the ClickHouse requests
  -- @param initial boolean compared with the flag of `{statement, flag}` schema
  --   elements; such an element is used only when both are equal
  local function upload_clickhouse_schema(upstream, ev_base, cfg, initial)
    local request_params = { ev_base = ev_base, config = cfg }
    local failed = false

    -- Printable address of the upstream for log messages
    local function server_name()
      return upstream:get_addr():to_string(true)
    end

    -- Substitute the schema version into a statement
    local function render(stmt)
      return lua_util.template(stmt, { SCHEMA_VERSION = tostring(schema_version) })
    end

    -- Upload a single element of the schema
    local function upload_schema_elt(stmt)
      if failed then
        rspamd_logger.errx(rspamd_config, "cannot upload schema '%s' on clickhouse server %s: due to previous errors",
            stmt, server_name())
        return
      end

      local err, reply = lua_clickhouse.generic_sync(upstream, settings, request_params, stmt)
      if not err then
        rspamd_logger.debugm(N, rspamd_config, 'uploaded clickhouse schema element %s to %s: %s',
            stmt, server_name(), reply)
        return
      end

      rspamd_logger.errx(rspamd_config, "cannot upload schema '%s' on clickhouse server %s: %s",
          stmt, server_name(), err)
      failed = true
    end

    -- Process element and return nil if statement should be skipped
    local function preprocess_schema_elt(elt)
      local elt_type = type(elt)

      if elt_type == 'string' then
        return render(elt)
      end

      if elt_type ~= 'table' then
        return nil
      end

      -- Pair of statement + boolean
      if initial == elt[2] then
        return render(elt[1])
      end

      rspamd_logger.debugm(N, rspamd_config, 'skip clickhouse schema element %s: schema already exists',
          elt)
      return nil
    end

    -- Apply schema elements sequentially, users additions are concatenated to the tail
    local all_elts = fun.chain(clickhouse_schema, settings.schema_additions)
    -- Also template schema version
    local rendered = fun.map(preprocess_schema_elt, all_elts)
    local applicable = fun.filter(function(stmt)
      return stmt ~= nil
    end, rendered)

    fun.each(upload_schema_elt, applicable)
  end
  --- Apply all migrations from `version` up to (not including) `schema_version`.
  -- Each next statement is started from the success callback of the previous
  -- one, so the chain stops at the first failed statement.
  -- @param upstream upstream to send the statements to
  -- @param ev_base event base handed over to the ClickHouse requests
  -- @param cfg configuration object handed over to the ClickHouse requests
  -- @param version schema version to start migrating from
  local function maybe_apply_migrations(upstream, ev_base, cfg, version)
    local request_params = { ev_base = ev_base, config = cfg }

    -- Forward declaration: statements and migrations call each other
    local apply_migration

    -- Run statement number `stmt_idx` of migration `idx`, then continue
    local function apply_statement(idx, stmt_idx)
      local sql = migrations[idx][stmt_idx]
      if not sql then
        return
      end

      local function on_success(_, _)
        rspamd_logger.infox(rspamd_config,
            'applied migration to version %s from version %s: %s',
            idx + 1, version, sql:gsub('[\n%s]+', ' '))
        if stmt_idx == #migrations[idx] then
          -- Go to the next migration
          apply_migration(idx + 1)
        else
          -- Apply the next statement
          apply_statement(idx, stmt_idx + 1)
        end
      end

      local function on_failure(_, err)
        rspamd_logger.errx(rspamd_config,
            "cannot apply migration %s: '%s' on clickhouse server %s: %s",
            idx, sql, upstream:get_addr():to_string(true), err)
      end

      local started = lua_clickhouse.generic(upstream, settings, request_params, sql,
          on_success, on_failure)
      if not started then
        rspamd_logger.errx(rspamd_config,
            "cannot apply migration %s: '%s' on clickhouse server %s: cannot make request",
            idx, sql, upstream:get_addr():to_string(true))
      end
    end

    -- Apply migrations sequentially
    apply_migration = function(idx)
      if not (idx < schema_version) then
        return
      end

      if migrations[idx] then
        -- We also need to apply statements sequentially
        apply_statement(idx, 1)
      else
        -- Try another migration
        apply_migration(idx + 1)
      end
    end

    apply_migration(version)
  end
  --- Add every column from `settings.extra_columns` to the `rspamd` table.
  -- The first column is placed after `MIMERcpt`, every next one after the
  -- previous extra column; each ALTER is started from the success callback of
  -- the preceding one.
  -- @param upstream upstream to send the statements to
  -- @param ev_base event base handed over to the ClickHouse requests
  -- @param cfg configuration object handed over to the ClickHouse requests
  local function add_extra_columns(upstream, ev_base, cfg)
    local request_params = { ev_base = ev_base, config = cfg }

    -- Add columns sequentially
    local function columns_recursor(pos)
      local columns = settings.extra_columns
      if not (pos <= #columns) then
        return
      end

      local col = columns[pos]
      local prev_column = (pos == 1) and 'MIMERcpt' or columns[pos - 1].name

      local clauses = {
        string.format('ALTER TABLE rspamd ADD COLUMN IF NOT EXISTS `%s` %s AFTER `%s`',
            col.name, col.type, prev_column)
      }
      if col.comment then
        clauses[#clauses + 1] = string.format(", COMMENT COLUMN IF EXISTS `%s` '%s'",
            col.name, col.comment)
      end
      local sql = table.concat(clauses)

      local function on_success(_, _)
        rspamd_logger.infox(rspamd_config,
            'added extra column %s (%s) after %s',
            col.name, col.type, prev_column)
        -- Apply the next statement
        columns_recursor(pos + 1)
      end

      local function on_failure(_, err)
        rspamd_logger.errx(rspamd_config,
            "cannot apply add column alter %s: '%s' on clickhouse server %s: %s",
            pos, sql, upstream:get_addr():to_string(true), err)
      end

      local started = lua_clickhouse.generic(upstream, settings, request_params, sql,
          on_success, on_failure)
      if not started then
        rspamd_logger.errx(rspamd_config,
            "cannot apply add column alter %s: '%s' on clickhouse server %s: cannot make request",
            pos, sql, upstream:get_addr():to_string(true))
      end
    end

    columns_recursor(1)
  end
  --- Check whether the `rspamd` table exists on an upstream.
  -- An existing table gets the non-initial schema elements and migrations
  -- starting from version 1; a missing table gets the full schema.
  -- @param upstream upstream to query
  -- @param ev_base event base handed over to the ClickHouse requests
  -- @param cfg configuration object handed over to the ClickHouse requests
  local function check_rspamd_table(upstream, ev_base, cfg)
    local request_params = { ev_base = ev_base, config = cfg }

    local err, rows = lua_clickhouse.select_sync(upstream, settings, request_params,
        [[EXISTS TABLE rspamd]])
    if err then
      rspamd_logger.errx(rspamd_config, "cannot check rspamd table in clickhouse server %s: %s",
          upstream:get_addr():to_string(true), err)
      return
    end

    local answer = rows[1]
    if not (answer and answer.result) then
      rspamd_logger.errx(rspamd_config,
          "unexpected reply on EXISTS command from server %s: %s",
          upstream:get_addr():to_string(true), rows)
      return
    end

    if tonumber(answer.result) ~= 1 then
      -- Upload schema
      rspamd_logger.infox(rspamd_config, 'table rspamd does not exists, upload full schema')
      upload_clickhouse_schema(upstream, ev_base, cfg, true)
      return
    end

    -- Apply migration
    upload_clickhouse_schema(upstream, ev_base, cfg, false)
    rspamd_logger.infox(rspamd_config, 'table rspamd exists, check if we need to apply migrations')
    maybe_apply_migrations(upstream, ev_base, cfg, 1)
  end
  --- Prepare one ClickHouse upstream: upload custom rule schemas, bring the
  -- main schema to the current version and add the configured extra columns.
  -- @param upstream upstream to prepare
  -- @param ev_base event base handed over to the ClickHouse requests
  -- @param cfg configuration object handed over to the ClickHouse requests
  local function check_clickhouse_upstream(upstream, ev_base, cfg)
    local ch_params = {
      ev_base = ev_base,
      config = cfg,
    }

    -- If we have some custom rules, we just send its schema to the upstream
    for k, rule in pairs(settings.custom_rules) do
      if rule.schema then
        local sql = lua_util.template(rule.schema, settings)
        local err, _ = lua_clickhouse.generic_sync(upstream, settings, ch_params, sql)
        if err then
          rspamd_logger.errx(rspamd_config, 'cannot send custom schema %s to clickhouse server %s: ' ..
              'cannot make request (%s)',
              k, upstream:get_addr():to_string(true), err)
        end
      end
    end

    -- Now check the main schema and apply migrations if needed
    local sql = [[SELECT MAX(Version) as v FROM rspamd_version]]
    local err, rows = lua_clickhouse.select_sync(upstream, settings, ch_params, sql)
    if err then
      if rows and rows.code == 404 then
        rspamd_logger.infox(rspamd_config,
            'table rspamd_version does not exist, check rspamd table')
        check_rspamd_table(upstream, ev_base, cfg)
      else
        rspamd_logger.errx(rspamd_config,
            "cannot get version on clickhouse server %s: %s",
            upstream:get_addr():to_string(true), err)
      end
    else
      upload_clickhouse_schema(upstream, ev_base, cfg, false)
      -- Guard against an empty or non-numeric reply: a nil version would
      -- raise inside the migration code and skip the extra columns step below
      local version = rows and rows[1] and tonumber(rows[1].v)
      if version then
        maybe_apply_migrations(upstream, ev_base, cfg, version)
      else
        rspamd_logger.errx(rspamd_config,
            "cannot get version on clickhouse server %s: unexpected reply: %s",
            upstream:get_addr():to_string(true), rows)
      end
    end

    if #settings.extra_columns > 0 then
      add_extra_columns(upstream, ev_base, cfg)
    end
  end
  -- Module setup: merge the `clickhouse` configuration section into `settings`,
  -- compile custom rules and extra column selectors, then register the
  -- collection symbol, the finish script and the on-load hooks.
  local opts = rspamd_config:get_all_opt('clickhouse')
  if opts then
    -- Legacy `limit` options
    if opts.limit and not opts.limits then
      settings.limits.max_rows = opts.limit
    end

    for k, v in pairs(opts) do
      if k == 'custom_rules' then
        -- A single rule may be given without the enclosing array
        if not v[1] then
          v = { v }
        end

        for i, rule in ipairs(v) do
          if rule.schema and rule.first_row and rule.get_row then
            local first_row, get_row
            local loadstring = loadstring or load
            -- The configured source is executed and must evaluate to a function
            local ret, res_or_err = pcall(loadstring(rule.first_row))

            if not ret or type(res_or_err) ~= 'function' then
              rspamd_logger.errx(rspamd_config, 'invalid first_row (%s) - must be a function',
                  res_or_err)
            else
              first_row = res_or_err
            end

            ret, res_or_err = pcall(loadstring(rule.get_row))

            if not ret or type(res_or_err) ~= 'function' then
              rspamd_logger.errx(rspamd_config,
                  'invalid get_row (%s) - must be a function',
                  res_or_err)
            else
              get_row = res_or_err
            end

            if first_row and get_row then
              local name = rule.name or tostring(i)
              settings.custom_rules[name] = {
                schema = rule.schema,
                first_row = first_row,
                get_row = get_row,
              }
            end
          else
            rspamd_logger.errx(rspamd_config, 'custom rule has no required attributes: schema, first_row and get_row')
          end
        end
      else
        settings[k] = lua_util.deepcopy(v)
      end
    end

    if not settings['server'] and not settings['servers'] then
      rspamd_logger.infox(rspamd_config, 'no servers are specified, disabling module')
      lua_util.disable_module(N, "config")
    else
      local lua_maps = require "lua_maps"
      settings['from_map'] = lua_maps.map_add('clickhouse', 'from_tables',
          'regexp', 'clickhouse specific domains')

      settings.upstream = upstream_list.create(rspamd_config,
          settings['server'] or settings['servers'], 8123)

      if not settings.upstream then
        rspamd_logger.errx(rspamd_config, 'cannot parse clickhouse address: %s',
            settings['server'] or settings['servers'])
        lua_util.disable_module(N, "config")
        return
      end

      if settings.exceptions then
        local maps_expressions = require "lua_maps_expressions"

        settings.exceptions = maps_expressions.create(rspamd_config,
            settings.exceptions, N)
      end

      if settings.extra_columns then
        -- Check sanity and create selector closures
        local lua_selectors = require "lua_selectors"

        local columns_transformed = {}
        local need_sort = false
        -- Select traverse function depending on what we have
        local iter_func = settings.extra_columns[1] and ipairs or pairs

        for col_name, col_data in iter_func(settings.extra_columns) do
          -- Array based extra columns
          if col_data.name then
            col_name = col_data.name
          end

          if not col_data.selector or not col_data.type then
            rspamd_logger.errx(rspamd_config, 'cannot add clickhouse extra row %s: no type or no selector',
                col_name)
          else
            local is_array = false

            if col_data.type:lower():match('^array') then
              is_array = true
            end

            local selector = lua_selectors.create_selector_closure(rspamd_config,
                col_data.selector, col_data.delimiter or '', is_array)

            if not selector then
              rspamd_logger.errx(rspamd_config, 'cannot add clickhouse extra row %s: bad selector: %s',
                  col_name, col_data.selector)
            else
              if not col_data.default_value then
                if is_array then
                  col_data.default_value = {}
                else
                  col_data.default_value = ''
                end
              end

              col_data.real_selector = selector

              if not col_data.name then
                col_data.name = col_name
                need_sort = true
              end

              table.insert(columns_transformed, col_data)
            end
          end
        end

        -- Convert extra columns from a map to an array sorted by column name to
        -- preserve strict order when doing altering
        if need_sort then
          rspamd_logger.infox(rspamd_config, 'sort extra columns as they are not configured as an array')
          table.sort(columns_transformed, function(c1, c2)
            return c1.name < c2.name
          end)
        end

        settings.extra_columns = columns_transformed
      end

      rspamd_config:register_symbol({
        name = 'CLICKHOUSE_COLLECT',
        type = 'idempotent',
        callback = clickhouse_collect,
        flags = 'empty,explicit_disable,ignore_passthrough',
        augmentations = { string.format("timeout=%f", settings.timeout) },
      })

      -- Flush whatever is still buffered when the finish script runs
      rspamd_config:register_finish_script(function(task)
        if nrows > 0 then
          final_call = true
          local saved_rows = data_rows
          local saved_custom = custom_rows

          nrows = 0
          data_rows = {}
          used_memory = 0
          custom_rows = {}

          clickhouse_send_data(task, nil, 'final collection',
              saved_rows, saved_custom)

          if settings.collect_garbage then
            collectgarbage()
          end
        end
      end)

      -- Create tables on load
      rspamd_config:add_on_load(function(cfg, ev_base, worker)
        if worker:is_scanner() then
          rspamd_config:add_periodic(ev_base, 0,
              clickhouse_maybe_send_data_periodic, true)
        end

        if worker:is_primary_controller() then
          local upstreams = settings.upstream:all_upstreams()

          for _, up in ipairs(upstreams) do
            check_clickhouse_upstream(up, ev_base, cfg)
          end

          if settings.retention.enable and settings.retention.method ~= 'drop' and
              settings.retention.method ~= 'detach' then
            rspamd_logger.errx(rspamd_config,
                "retention.method should be either 'drop' or 'detach' (now: %s). Disabling retention",
                settings.retention.method)
            settings.retention.enable = false
          end

          -- `and` binds tighter than `or`: the range check must be parenthesised
          -- so that it is only evaluated (and reported) when retention is enabled
          if settings.retention.enable and (settings.retention.period_months < 1 or
              settings.retention.period_months > 1000) then
            rspamd_logger.errx(rspamd_config,
                "please, set retention.period_months between 1 and 1000 (now: %s). Disabling retention",
                settings.retention.period_months)
            settings.retention.enable = false
          end

          local period = lua_util.parse_time_interval(settings.retention.run_every)

          if settings.retention.enable and period == nil then
            rspamd_logger.errx(rspamd_config, "invalid value for retention.run_every (%s). Disabling retention",
                settings.retention.run_every)
            settings.retention.enable = false
          end

          if settings.retention.enable then
            settings.retention.period = period
            rspamd_logger.infox(rspamd_config,
                "retention will be performed each %s seconds for %s month with method %s",
                period, settings.retention.period_months, settings.retention.method)
            rspamd_config:add_periodic(ev_base, 0, clickhouse_remove_old_partitions, false)
          end
        end
      end)
    end
  end