You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

fuzzy_backend_sqlite.c 28KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029
  1. /*
  2. * Copyright 2024 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "config.h"
  17. #include "rspamd.h"
  18. #include "fuzzy_backend.h"
  19. #include "fuzzy_backend_sqlite.h"
  20. #include "unix-std.h"
  21. #include <sqlite3.h>
  22. #include "libutil/sqlite_utils.h"
  23. struct rspamd_fuzzy_backend_sqlite {
  24. sqlite3 *db;
  25. char *path;
  26. char id[MEMPOOL_UID_LEN];
  27. gsize count;
  28. gsize expired;
  29. rspamd_mempool_t *pool;
  30. };
  31. static const double sql_sleep_time = 0.1;
  32. static const unsigned int max_retries = 10;
  33. #define msg_err_fuzzy_backend(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \
  34. backend->pool->tag.tagname, backend->pool->tag.uid, \
  35. G_STRFUNC, \
  36. __VA_ARGS__)
  37. #define msg_warn_fuzzy_backend(...) rspamd_default_log_function(G_LOG_LEVEL_WARNING, \
  38. backend->pool->tag.tagname, backend->pool->tag.uid, \
  39. G_STRFUNC, \
  40. __VA_ARGS__)
  41. #define msg_info_fuzzy_backend(...) rspamd_default_log_function(G_LOG_LEVEL_INFO, \
  42. backend->pool->tag.tagname, backend->pool->tag.uid, \
  43. G_STRFUNC, \
  44. __VA_ARGS__)
  45. #define msg_debug_fuzzy_backend(...) rspamd_conditional_debug_fast(NULL, NULL, \
  46. rspamd_fuzzy_sqlite_log_id, backend->pool->tag.tagname, backend->pool->tag.uid, \
  47. G_STRFUNC, \
  48. __VA_ARGS__)
  49. INIT_LOG_MODULE(fuzzy_sqlite)
  50. static const char *create_tables_sql =
  51. "BEGIN;"
  52. "CREATE TABLE IF NOT EXISTS digests("
  53. " id INTEGER PRIMARY KEY,"
  54. " flag INTEGER NOT NULL,"
  55. " digest TEXT NOT NULL,"
  56. " value INTEGER,"
  57. " time INTEGER);"
  58. "CREATE TABLE IF NOT EXISTS shingles("
  59. " value INTEGER NOT NULL,"
  60. " number INTEGER NOT NULL,"
  61. " digest_id INTEGER REFERENCES digests(id) ON DELETE CASCADE "
  62. " ON UPDATE CASCADE);"
  63. "CREATE TABLE IF NOT EXISTS sources("
  64. " name TEXT UNIQUE,"
  65. " version INTEGER,"
  66. " last INTEGER);"
  67. "CREATE UNIQUE INDEX IF NOT EXISTS d ON digests(digest);"
  68. "CREATE INDEX IF NOT EXISTS t ON digests(time);"
  69. "CREATE INDEX IF NOT EXISTS dgst_id ON shingles(digest_id);"
  70. "CREATE UNIQUE INDEX IF NOT EXISTS s ON shingles(value, number);"
  71. "COMMIT;";
  72. #if 0
  73. static const char *create_index_sql =
  74. "BEGIN;"
  75. "CREATE UNIQUE INDEX IF NOT EXISTS d ON digests(digest);"
  76. "CREATE INDEX IF NOT EXISTS t ON digests(time);"
  77. "CREATE INDEX IF NOT EXISTS dgst_id ON shingles(digest_id);"
  78. "CREATE UNIQUE INDEX IF NOT EXISTS s ON shingles(value, number);"
  79. "COMMIT;";
  80. #endif
  81. enum rspamd_fuzzy_statement_idx {
  82. RSPAMD_FUZZY_BACKEND_TRANSACTION_START = 0,
  83. RSPAMD_FUZZY_BACKEND_TRANSACTION_COMMIT,
  84. RSPAMD_FUZZY_BACKEND_TRANSACTION_ROLLBACK,
  85. RSPAMD_FUZZY_BACKEND_INSERT,
  86. RSPAMD_FUZZY_BACKEND_UPDATE,
  87. RSPAMD_FUZZY_BACKEND_UPDATE_FLAG,
  88. RSPAMD_FUZZY_BACKEND_INSERT_SHINGLE,
  89. RSPAMD_FUZZY_BACKEND_CHECK,
  90. RSPAMD_FUZZY_BACKEND_CHECK_SHINGLE,
  91. RSPAMD_FUZZY_BACKEND_GET_DIGEST_BY_ID,
  92. RSPAMD_FUZZY_BACKEND_DELETE,
  93. RSPAMD_FUZZY_BACKEND_COUNT,
  94. RSPAMD_FUZZY_BACKEND_EXPIRE,
  95. RSPAMD_FUZZY_BACKEND_VACUUM,
  96. RSPAMD_FUZZY_BACKEND_DELETE_ORPHANED,
  97. RSPAMD_FUZZY_BACKEND_ADD_SOURCE,
  98. RSPAMD_FUZZY_BACKEND_VERSION,
  99. RSPAMD_FUZZY_BACKEND_SET_VERSION,
  100. RSPAMD_FUZZY_BACKEND_MAX
  101. };
  102. static struct rspamd_fuzzy_stmts {
  103. enum rspamd_fuzzy_statement_idx idx;
  104. const char *sql;
  105. const char *args;
  106. sqlite3_stmt *stmt;
  107. int result;
  108. } prepared_stmts[RSPAMD_FUZZY_BACKEND_MAX] =
  109. {
  110. {.idx = RSPAMD_FUZZY_BACKEND_TRANSACTION_START,
  111. .sql = "BEGIN TRANSACTION;",
  112. .args = "",
  113. .stmt = NULL,
  114. .result = SQLITE_DONE},
  115. {.idx = RSPAMD_FUZZY_BACKEND_TRANSACTION_COMMIT,
  116. .sql = "COMMIT;",
  117. .args = "",
  118. .stmt = NULL,
  119. .result = SQLITE_DONE},
  120. {.idx = RSPAMD_FUZZY_BACKEND_TRANSACTION_ROLLBACK,
  121. .sql = "ROLLBACK;",
  122. .args = "",
  123. .stmt = NULL,
  124. .result = SQLITE_DONE},
  125. {.idx = RSPAMD_FUZZY_BACKEND_INSERT,
  126. .sql = "INSERT INTO digests(flag, digest, value, time) VALUES"
  127. "(?1, ?2, ?3, strftime('%s','now'));",
  128. .args = "SDI",
  129. .stmt = NULL,
  130. .result = SQLITE_DONE},
  131. {.idx = RSPAMD_FUZZY_BACKEND_UPDATE,
  132. .sql = "UPDATE digests SET value = value + ?1, time = strftime('%s','now') WHERE "
  133. "digest==?2;",
  134. .args = "ID",
  135. .stmt = NULL,
  136. .result = SQLITE_DONE},
  137. {.idx = RSPAMD_FUZZY_BACKEND_UPDATE_FLAG,
  138. .sql = "UPDATE digests SET value = ?1, flag = ?2, time = strftime('%s','now') WHERE "
  139. "digest==?3;",
  140. .args = "IID",
  141. .stmt = NULL,
  142. .result = SQLITE_DONE},
  143. {.idx = RSPAMD_FUZZY_BACKEND_INSERT_SHINGLE,
  144. .sql = "INSERT OR REPLACE INTO shingles(value, number, digest_id) "
  145. "VALUES (?1, ?2, ?3);",
  146. .args = "III",
  147. .stmt = NULL,
  148. .result = SQLITE_DONE},
  149. {.idx = RSPAMD_FUZZY_BACKEND_CHECK,
  150. .sql = "SELECT value, time, flag FROM digests WHERE digest==?1;",
  151. .args = "D",
  152. .stmt = NULL,
  153. .result = SQLITE_ROW},
  154. {.idx = RSPAMD_FUZZY_BACKEND_CHECK_SHINGLE,
  155. .sql = "SELECT digest_id FROM shingles WHERE value=?1 AND number=?2",
  156. .args = "IS",
  157. .stmt = NULL,
  158. .result = SQLITE_ROW},
  159. {.idx = RSPAMD_FUZZY_BACKEND_GET_DIGEST_BY_ID,
  160. .sql = "SELECT digest, value, time, flag FROM digests WHERE id=?1",
  161. .args = "I",
  162. .stmt = NULL,
  163. .result = SQLITE_ROW},
  164. {.idx = RSPAMD_FUZZY_BACKEND_DELETE,
  165. .sql = "DELETE FROM digests WHERE digest==?1;",
  166. .args = "D",
  167. .stmt = NULL,
  168. .result = SQLITE_DONE},
  169. {.idx = RSPAMD_FUZZY_BACKEND_COUNT,
  170. .sql = "SELECT COUNT(*) FROM digests;",
  171. .args = "",
  172. .stmt = NULL,
  173. .result = SQLITE_ROW},
  174. {.idx = RSPAMD_FUZZY_BACKEND_EXPIRE,
  175. .sql = "DELETE FROM digests WHERE id IN (SELECT id FROM digests WHERE time < ?1 LIMIT ?2);",
  176. .args = "II",
  177. .stmt = NULL,
  178. .result = SQLITE_DONE},
  179. {.idx = RSPAMD_FUZZY_BACKEND_VACUUM,
  180. .sql = "VACUUM;",
  181. .args = "",
  182. .stmt = NULL,
  183. .result = SQLITE_DONE},
  184. {.idx = RSPAMD_FUZZY_BACKEND_DELETE_ORPHANED,
  185. .sql = "DELETE FROM shingles WHERE value=?1 AND number=?2;",
  186. .args = "II",
  187. .stmt = NULL,
  188. .result = SQLITE_DONE},
  189. {.idx = RSPAMD_FUZZY_BACKEND_ADD_SOURCE,
  190. .sql = "INSERT OR IGNORE INTO sources(name, version, last) VALUES (?1, ?2, ?3);",
  191. .args = "TII",
  192. .stmt = NULL,
  193. .result = SQLITE_DONE},
  194. {.idx = RSPAMD_FUZZY_BACKEND_VERSION,
  195. .sql = "SELECT version FROM sources WHERE name=?1;",
  196. .args = "T",
  197. .stmt = NULL,
  198. .result = SQLITE_ROW},
  199. {.idx = RSPAMD_FUZZY_BACKEND_SET_VERSION,
  200. .sql = "INSERT OR REPLACE INTO sources (name, version, last) VALUES (?3, ?1, ?2);",
  201. .args = "IIT",
  202. .stmt = NULL,
  203. .result = SQLITE_DONE},
  204. };
  205. static GQuark
  206. rspamd_fuzzy_backend_sqlite_quark(void)
  207. {
  208. return g_quark_from_static_string("fuzzy-backend-sqlite");
  209. }
  210. static gboolean
  211. rspamd_fuzzy_backend_sqlite_prepare_stmts(struct rspamd_fuzzy_backend_sqlite *bk, GError **err)
  212. {
  213. int i;
  214. for (i = 0; i < RSPAMD_FUZZY_BACKEND_MAX; i++) {
  215. if (prepared_stmts[i].stmt != NULL) {
  216. /* Skip already prepared statements */
  217. continue;
  218. }
  219. if (sqlite3_prepare_v2(bk->db, prepared_stmts[i].sql, -1,
  220. &prepared_stmts[i].stmt, NULL) != SQLITE_OK) {
  221. g_set_error(err, rspamd_fuzzy_backend_sqlite_quark(),
  222. -1, "Cannot initialize prepared sql `%s`: %s",
  223. prepared_stmts[i].sql, sqlite3_errmsg(bk->db));
  224. return FALSE;
  225. }
  226. }
  227. return TRUE;
  228. }
  229. static int
  230. rspamd_fuzzy_backend_sqlite_cleanup_stmt(struct rspamd_fuzzy_backend_sqlite *backend,
  231. int idx)
  232. {
  233. sqlite3_stmt *stmt;
  234. if (idx < 0 || idx >= RSPAMD_FUZZY_BACKEND_MAX) {
  235. return -1;
  236. }
  237. msg_debug_fuzzy_backend("resetting `%s`", prepared_stmts[idx].sql);
  238. stmt = prepared_stmts[idx].stmt;
  239. sqlite3_clear_bindings(stmt);
  240. sqlite3_reset(stmt);
  241. return SQLITE_OK;
  242. }
  243. static int
  244. rspamd_fuzzy_backend_sqlite_run_stmt(struct rspamd_fuzzy_backend_sqlite *backend,
  245. gboolean auto_cleanup,
  246. int idx, ...)
  247. {
  248. int retcode;
  249. va_list ap;
  250. sqlite3_stmt *stmt;
  251. int i;
  252. const char *argtypes;
  253. unsigned int retries = 0;
  254. struct timespec ts;
  255. if (idx < 0 || idx >= RSPAMD_FUZZY_BACKEND_MAX) {
  256. return -1;
  257. }
  258. stmt = prepared_stmts[idx].stmt;
  259. g_assert((int) prepared_stmts[idx].idx == idx);
  260. if (stmt == NULL) {
  261. if ((retcode = sqlite3_prepare_v2(backend->db, prepared_stmts[idx].sql, -1,
  262. &prepared_stmts[idx].stmt, NULL)) != SQLITE_OK) {
  263. msg_err_fuzzy_backend("Cannot initialize prepared sql `%s`: %s",
  264. prepared_stmts[idx].sql, sqlite3_errmsg(backend->db));
  265. return retcode;
  266. }
  267. stmt = prepared_stmts[idx].stmt;
  268. }
  269. msg_debug_fuzzy_backend("executing `%s` %s auto cleanup",
  270. prepared_stmts[idx].sql, auto_cleanup ? "with" : "without");
  271. argtypes = prepared_stmts[idx].args;
  272. sqlite3_clear_bindings(stmt);
  273. sqlite3_reset(stmt);
  274. va_start(ap, idx);
  275. for (i = 0; argtypes[i] != '\0'; i++) {
  276. switch (argtypes[i]) {
  277. case 'T':
  278. sqlite3_bind_text(stmt, i + 1, va_arg(ap, const char *), -1,
  279. SQLITE_STATIC);
  280. break;
  281. case 'I':
  282. sqlite3_bind_int64(stmt, i + 1, va_arg(ap, int64_t));
  283. break;
  284. case 'S':
  285. sqlite3_bind_int(stmt, i + 1, va_arg(ap, int));
  286. break;
  287. case 'D':
  288. /* Special case for digests variable */
  289. sqlite3_bind_text(stmt, i + 1, va_arg(ap, const char *), 64,
  290. SQLITE_STATIC);
  291. break;
  292. }
  293. }
  294. va_end(ap);
  295. retry:
  296. retcode = sqlite3_step(stmt);
  297. if (retcode == prepared_stmts[idx].result) {
  298. retcode = SQLITE_OK;
  299. }
  300. else {
  301. if ((retcode == SQLITE_BUSY ||
  302. retcode == SQLITE_LOCKED) &&
  303. retries++ < max_retries) {
  304. double_to_ts(sql_sleep_time, &ts);
  305. nanosleep(&ts, NULL);
  306. goto retry;
  307. }
  308. msg_debug_fuzzy_backend("failed to execute query %s: %d, %s", prepared_stmts[idx].sql,
  309. retcode, sqlite3_errmsg(backend->db));
  310. }
  311. if (auto_cleanup) {
  312. sqlite3_clear_bindings(stmt);
  313. sqlite3_reset(stmt);
  314. }
  315. return retcode;
  316. }
  317. static void
  318. rspamd_fuzzy_backend_sqlite_close_stmts(struct rspamd_fuzzy_backend_sqlite *bk)
  319. {
  320. int i;
  321. for (i = 0; i < RSPAMD_FUZZY_BACKEND_MAX; i++) {
  322. if (prepared_stmts[i].stmt != NULL) {
  323. sqlite3_finalize(prepared_stmts[i].stmt);
  324. prepared_stmts[i].stmt = NULL;
  325. }
  326. }
  327. return;
  328. }
  329. static gboolean
  330. rspamd_fuzzy_backend_sqlite_run_sql(const char *sql, struct rspamd_fuzzy_backend_sqlite *bk,
  331. GError **err)
  332. {
  333. unsigned int retries = 0;
  334. struct timespec ts;
  335. int ret;
  336. do {
  337. ret = sqlite3_exec(bk->db, sql, NULL, NULL, NULL);
  338. double_to_ts(sql_sleep_time, &ts);
  339. } while (ret == SQLITE_BUSY && retries++ < max_retries &&
  340. nanosleep(&ts, NULL) == 0);
  341. if (ret != SQLITE_OK) {
  342. g_set_error(err, rspamd_fuzzy_backend_sqlite_quark(),
  343. -1, "Cannot execute raw sql `%s`: %s",
  344. sql, sqlite3_errmsg(bk->db));
  345. return FALSE;
  346. }
  347. return TRUE;
  348. }
  349. static struct rspamd_fuzzy_backend_sqlite *
  350. rspamd_fuzzy_backend_sqlite_open_db(const char *path, GError **err)
  351. {
  352. struct rspamd_fuzzy_backend_sqlite *bk;
  353. rspamd_cryptobox_hash_state_t st;
  354. unsigned char hash_out[rspamd_cryptobox_HASHBYTES];
  355. g_assert(path != NULL);
  356. bk = g_malloc0(sizeof(*bk));
  357. bk->path = g_strdup(path);
  358. bk->expired = 0;
  359. bk->pool = rspamd_mempool_new(rspamd_mempool_suggest_size(),
  360. "fuzzy_backend", 0);
  361. bk->db = rspamd_sqlite3_open_or_create(bk->pool, bk->path,
  362. create_tables_sql, 1, err);
  363. if (bk->db == NULL) {
  364. rspamd_fuzzy_backend_sqlite_close(bk);
  365. return NULL;
  366. }
  367. if (!rspamd_fuzzy_backend_sqlite_prepare_stmts(bk, err)) {
  368. rspamd_fuzzy_backend_sqlite_close(bk);
  369. return NULL;
  370. }
  371. /* Set id for the backend */
  372. rspamd_cryptobox_hash_init(&st, NULL, 0);
  373. rspamd_cryptobox_hash_update(&st, path, strlen(path));
  374. rspamd_cryptobox_hash_final(&st, hash_out);
  375. rspamd_snprintf(bk->id, sizeof(bk->id), "%xs", hash_out);
  376. memcpy(bk->pool->tag.uid, bk->id, sizeof(bk->pool->tag.uid));
  377. return bk;
  378. }
  379. struct rspamd_fuzzy_backend_sqlite *
  380. rspamd_fuzzy_backend_sqlite_open(const char *path,
  381. gboolean vacuum,
  382. GError **err)
  383. {
  384. struct rspamd_fuzzy_backend_sqlite *backend;
  385. if (path == NULL) {
  386. g_set_error(err, rspamd_fuzzy_backend_sqlite_quark(),
  387. ENOENT, "Path has not been specified");
  388. return NULL;
  389. }
  390. /* Open database */
  391. if ((backend = rspamd_fuzzy_backend_sqlite_open_db(path, err)) == NULL) {
  392. return NULL;
  393. }
  394. if (rspamd_fuzzy_backend_sqlite_run_stmt(backend, FALSE, RSPAMD_FUZZY_BACKEND_COUNT) == SQLITE_OK) {
  395. backend->count = sqlite3_column_int64(
  396. prepared_stmts[RSPAMD_FUZZY_BACKEND_COUNT].stmt, 0);
  397. }
  398. rspamd_fuzzy_backend_sqlite_cleanup_stmt(backend, RSPAMD_FUZZY_BACKEND_COUNT);
  399. return backend;
  400. }
  401. static int
  402. rspamd_fuzzy_backend_sqlite_int64_cmp(const void *a, const void *b)
  403. {
  404. int64_t ia = *(int64_t *) a, ib = *(int64_t *) b;
  405. return (ia - ib);
  406. }
  407. struct rspamd_fuzzy_reply
  408. rspamd_fuzzy_backend_sqlite_check(struct rspamd_fuzzy_backend_sqlite *backend,
  409. const struct rspamd_fuzzy_cmd *cmd, int64_t expire)
  410. {
  411. struct rspamd_fuzzy_reply rep;
  412. const struct rspamd_fuzzy_shingle_cmd *shcmd;
  413. int rc;
  414. int64_t timestamp;
  415. int64_t shingle_values[RSPAMD_SHINGLE_SIZE], i, sel_id, cur_id,
  416. cur_cnt, max_cnt;
  417. memset(&rep, 0, sizeof(rep));
  418. memcpy(rep.digest, cmd->digest, sizeof(rep.digest));
  419. if (backend == NULL) {
  420. return rep;
  421. }
  422. /* Try direct match first of all */
  423. rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE,
  424. RSPAMD_FUZZY_BACKEND_TRANSACTION_START);
  425. rc = rspamd_fuzzy_backend_sqlite_run_stmt(backend, FALSE,
  426. RSPAMD_FUZZY_BACKEND_CHECK,
  427. cmd->digest);
  428. if (rc == SQLITE_OK) {
  429. timestamp = sqlite3_column_int64(
  430. prepared_stmts[RSPAMD_FUZZY_BACKEND_CHECK].stmt, 1);
  431. if (time(NULL) - timestamp > expire) {
  432. /* Expire element */
  433. msg_debug_fuzzy_backend("requested hash has been expired");
  434. }
  435. else {
  436. rep.v1.value = sqlite3_column_int64(
  437. prepared_stmts[RSPAMD_FUZZY_BACKEND_CHECK].stmt, 0);
  438. rep.v1.prob = 1.0;
  439. rep.v1.flag = sqlite3_column_int(
  440. prepared_stmts[RSPAMD_FUZZY_BACKEND_CHECK].stmt, 2);
  441. }
  442. }
  443. else if (cmd->shingles_count > 0) {
  444. /* Fuzzy match */
  445. rspamd_fuzzy_backend_sqlite_cleanup_stmt(backend, RSPAMD_FUZZY_BACKEND_CHECK);
  446. shcmd = (const struct rspamd_fuzzy_shingle_cmd *) cmd;
  447. for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
  448. rc = rspamd_fuzzy_backend_sqlite_run_stmt(backend, FALSE,
  449. RSPAMD_FUZZY_BACKEND_CHECK_SHINGLE,
  450. shcmd->sgl.hashes[i], i);
  451. if (rc == SQLITE_OK) {
  452. shingle_values[i] = sqlite3_column_int64(
  453. prepared_stmts[RSPAMD_FUZZY_BACKEND_CHECK_SHINGLE].stmt,
  454. 0);
  455. }
  456. else {
  457. shingle_values[i] = -1;
  458. }
  459. msg_debug_fuzzy_backend("looking for shingle %L -> %L: %d", i,
  460. shcmd->sgl.hashes[i], rc);
  461. }
  462. rspamd_fuzzy_backend_sqlite_cleanup_stmt(backend,
  463. RSPAMD_FUZZY_BACKEND_CHECK_SHINGLE);
  464. qsort(shingle_values, RSPAMD_SHINGLE_SIZE, sizeof(int64_t),
  465. rspamd_fuzzy_backend_sqlite_int64_cmp);
  466. sel_id = -1;
  467. cur_id = -1;
  468. cur_cnt = 0;
  469. max_cnt = 0;
  470. for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
  471. if (shingle_values[i] == -1) {
  472. continue;
  473. }
  474. /* We have some value here, so we need to check it */
  475. if (shingle_values[i] == cur_id) {
  476. cur_cnt++;
  477. }
  478. else {
  479. cur_id = shingle_values[i];
  480. if (cur_cnt >= max_cnt) {
  481. max_cnt = cur_cnt;
  482. sel_id = cur_id;
  483. }
  484. cur_cnt = 0;
  485. }
  486. }
  487. if (cur_cnt > max_cnt) {
  488. max_cnt = cur_cnt;
  489. }
  490. if (sel_id != -1) {
  491. /* We have some id selected here */
  492. rep.v1.prob = (float) max_cnt / (float) RSPAMD_SHINGLE_SIZE;
  493. if (rep.v1.prob > 0.5) {
  494. msg_debug_fuzzy_backend(
  495. "found fuzzy hash with probability %.2f",
  496. rep.v1.prob);
  497. rc = rspamd_fuzzy_backend_sqlite_run_stmt(backend, FALSE,
  498. RSPAMD_FUZZY_BACKEND_GET_DIGEST_BY_ID, sel_id);
  499. if (rc == SQLITE_OK) {
  500. timestamp = sqlite3_column_int64(
  501. prepared_stmts[RSPAMD_FUZZY_BACKEND_GET_DIGEST_BY_ID].stmt,
  502. 2);
  503. if (time(NULL) - timestamp > expire) {
  504. /* Expire element */
  505. msg_debug_fuzzy_backend(
  506. "requested hash has been expired");
  507. rep.v1.prob = 0.0;
  508. }
  509. else {
  510. rep.ts = timestamp;
  511. memcpy(rep.digest, sqlite3_column_blob(prepared_stmts[RSPAMD_FUZZY_BACKEND_GET_DIGEST_BY_ID].stmt, 0), sizeof(rep.digest));
  512. rep.v1.value = sqlite3_column_int64(
  513. prepared_stmts[RSPAMD_FUZZY_BACKEND_GET_DIGEST_BY_ID].stmt,
  514. 1);
  515. rep.v1.flag = sqlite3_column_int(
  516. prepared_stmts[RSPAMD_FUZZY_BACKEND_GET_DIGEST_BY_ID].stmt,
  517. 3);
  518. }
  519. }
  520. }
  521. else {
  522. /* Otherwise we assume that as error */
  523. rep.v1.value = 0;
  524. }
  525. rspamd_fuzzy_backend_sqlite_cleanup_stmt(backend,
  526. RSPAMD_FUZZY_BACKEND_GET_DIGEST_BY_ID);
  527. }
  528. }
  529. rspamd_fuzzy_backend_sqlite_cleanup_stmt(backend, RSPAMD_FUZZY_BACKEND_CHECK);
  530. rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE,
  531. RSPAMD_FUZZY_BACKEND_TRANSACTION_COMMIT);
  532. return rep;
  533. }
  534. gboolean
  535. rspamd_fuzzy_backend_sqlite_prepare_update(struct rspamd_fuzzy_backend_sqlite *backend,
  536. const char *source)
  537. {
  538. int rc;
  539. if (backend == NULL) {
  540. return FALSE;
  541. }
  542. rc = rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE,
  543. RSPAMD_FUZZY_BACKEND_TRANSACTION_START);
  544. if (rc != SQLITE_OK) {
  545. msg_warn_fuzzy_backend("cannot start transaction for updates: %s",
  546. sqlite3_errmsg(backend->db));
  547. return FALSE;
  548. }
  549. return TRUE;
  550. }
  551. gboolean
  552. rspamd_fuzzy_backend_sqlite_add(struct rspamd_fuzzy_backend_sqlite *backend,
  553. const struct rspamd_fuzzy_cmd *cmd)
  554. {
  555. int rc, i;
  556. int64_t id, flag;
  557. const struct rspamd_fuzzy_shingle_cmd *shcmd;
  558. if (backend == NULL) {
  559. return FALSE;
  560. }
  561. rc = rspamd_fuzzy_backend_sqlite_run_stmt(backend, FALSE,
  562. RSPAMD_FUZZY_BACKEND_CHECK,
  563. cmd->digest);
  564. if (rc == SQLITE_OK) {
  565. /* Check flag */
  566. flag = sqlite3_column_int64(
  567. prepared_stmts[RSPAMD_FUZZY_BACKEND_CHECK].stmt,
  568. 2);
  569. rspamd_fuzzy_backend_sqlite_cleanup_stmt(backend, RSPAMD_FUZZY_BACKEND_CHECK);
  570. if (flag == cmd->flag) {
  571. /* We need to increase weight */
  572. rc = rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE,
  573. RSPAMD_FUZZY_BACKEND_UPDATE,
  574. (int64_t) cmd->value,
  575. cmd->digest);
  576. if (rc != SQLITE_OK) {
  577. msg_warn_fuzzy_backend("cannot update hash to %d -> "
  578. "%*xs: %s",
  579. (int) cmd->flag,
  580. (int) sizeof(cmd->digest), cmd->digest,
  581. sqlite3_errmsg(backend->db));
  582. }
  583. }
  584. else {
  585. /* We need to relearn actually */
  586. rc = rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE,
  587. RSPAMD_FUZZY_BACKEND_UPDATE_FLAG,
  588. (int64_t) cmd->value,
  589. (int64_t) cmd->flag,
  590. cmd->digest);
  591. if (rc != SQLITE_OK) {
  592. msg_warn_fuzzy_backend("cannot update hash to %d -> "
  593. "%*xs: %s",
  594. (int) cmd->flag,
  595. (int) sizeof(cmd->digest), cmd->digest,
  596. sqlite3_errmsg(backend->db));
  597. }
  598. }
  599. }
  600. else {
  601. rspamd_fuzzy_backend_sqlite_cleanup_stmt(backend, RSPAMD_FUZZY_BACKEND_CHECK);
  602. rc = rspamd_fuzzy_backend_sqlite_run_stmt(backend, FALSE,
  603. RSPAMD_FUZZY_BACKEND_INSERT,
  604. (int) cmd->flag,
  605. cmd->digest,
  606. (int64_t) cmd->value);
  607. if (rc == SQLITE_OK) {
  608. if (cmd->shingles_count > 0) {
  609. id = sqlite3_last_insert_rowid(backend->db);
  610. shcmd = (const struct rspamd_fuzzy_shingle_cmd *) cmd;
  611. for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
  612. rc = rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE,
  613. RSPAMD_FUZZY_BACKEND_INSERT_SHINGLE,
  614. shcmd->sgl.hashes[i], (int64_t) i, id);
  615. msg_debug_fuzzy_backend("add shingle %d -> %L: %L",
  616. i,
  617. shcmd->sgl.hashes[i],
  618. id);
  619. if (rc != SQLITE_OK) {
  620. msg_warn_fuzzy_backend("cannot add shingle %d -> "
  621. "%L: %L: %s",
  622. i,
  623. shcmd->sgl.hashes[i],
  624. id, sqlite3_errmsg(backend->db));
  625. }
  626. }
  627. }
  628. }
  629. else {
  630. msg_warn_fuzzy_backend("cannot add hash to %d -> "
  631. "%*xs: %s",
  632. (int) cmd->flag,
  633. (int) sizeof(cmd->digest), cmd->digest,
  634. sqlite3_errmsg(backend->db));
  635. }
  636. rspamd_fuzzy_backend_sqlite_cleanup_stmt(backend,
  637. RSPAMD_FUZZY_BACKEND_INSERT);
  638. }
  639. return (rc == SQLITE_OK);
  640. }
  641. gboolean
  642. rspamd_fuzzy_backend_sqlite_finish_update(struct rspamd_fuzzy_backend_sqlite *backend,
  643. const char *source, gboolean version_bump)
  644. {
  645. int rc = SQLITE_OK, wal_frames, wal_checkpointed, ver;
  646. /* Get and update version */
  647. if (version_bump) {
  648. ver = rspamd_fuzzy_backend_sqlite_version(backend, source);
  649. ++ver;
  650. rc = rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE,
  651. RSPAMD_FUZZY_BACKEND_SET_VERSION,
  652. (int64_t) ver, (int64_t) time(NULL), source);
  653. }
  654. if (rc == SQLITE_OK) {
  655. rc = rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE,
  656. RSPAMD_FUZZY_BACKEND_TRANSACTION_COMMIT);
  657. if (rc != SQLITE_OK) {
  658. msg_warn_fuzzy_backend("cannot commit updates: %s",
  659. sqlite3_errmsg(backend->db));
  660. rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE,
  661. RSPAMD_FUZZY_BACKEND_TRANSACTION_ROLLBACK);
  662. return FALSE;
  663. }
  664. else {
  665. if (!rspamd_sqlite3_sync(backend->db, &wal_frames, &wal_checkpointed)) {
  666. msg_warn_fuzzy_backend("cannot commit checkpoint: %s",
  667. sqlite3_errmsg(backend->db));
  668. }
  669. else if (wal_checkpointed > 0) {
  670. msg_info_fuzzy_backend("total number of frames in the wal file: "
  671. "%d, checkpointed: %d",
  672. wal_frames, wal_checkpointed);
  673. }
  674. }
  675. }
  676. else {
  677. msg_warn_fuzzy_backend("cannot update version for %s: %s", source,
  678. sqlite3_errmsg(backend->db));
  679. rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE,
  680. RSPAMD_FUZZY_BACKEND_TRANSACTION_ROLLBACK);
  681. return FALSE;
  682. }
  683. return TRUE;
  684. }
  685. gboolean
  686. rspamd_fuzzy_backend_sqlite_del(struct rspamd_fuzzy_backend_sqlite *backend,
  687. const struct rspamd_fuzzy_cmd *cmd)
  688. {
  689. int rc = -1;
  690. if (backend == NULL) {
  691. return FALSE;
  692. }
  693. rc = rspamd_fuzzy_backend_sqlite_run_stmt(backend, FALSE,
  694. RSPAMD_FUZZY_BACKEND_CHECK,
  695. cmd->digest);
  696. if (rc == SQLITE_OK) {
  697. rspamd_fuzzy_backend_sqlite_cleanup_stmt(backend, RSPAMD_FUZZY_BACKEND_CHECK);
  698. rc = rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE,
  699. RSPAMD_FUZZY_BACKEND_DELETE,
  700. cmd->digest);
  701. if (rc != SQLITE_OK) {
  702. msg_warn_fuzzy_backend("cannot update hash to %d -> "
  703. "%*xs: %s",
  704. (int) cmd->flag,
  705. (int) sizeof(cmd->digest), cmd->digest,
  706. sqlite3_errmsg(backend->db));
  707. }
  708. }
  709. else {
  710. /* Hash is missing */
  711. rspamd_fuzzy_backend_sqlite_cleanup_stmt(backend, RSPAMD_FUZZY_BACKEND_CHECK);
  712. }
  713. return (rc == SQLITE_OK);
  714. }
  715. gboolean
  716. rspamd_fuzzy_backend_sqlite_sync(struct rspamd_fuzzy_backend_sqlite *backend,
  717. int64_t expire,
  718. gboolean clean_orphaned)
  719. {
  720. struct orphaned_shingle_elt {
  721. int64_t value;
  722. int64_t number;
  723. };
  724. /* Do not do more than 5k ops per step */
  725. const uint64_t max_changes = 5000;
  726. gboolean ret = FALSE;
  727. int64_t expire_lim, expired;
  728. int rc, i, orphaned_cnt = 0;
  729. GError *err = NULL;
  730. static const char orphaned_shingles[] = "SELECT shingles.value,shingles.number "
  731. "FROM shingles "
  732. "LEFT JOIN digests ON "
  733. "shingles.digest_id=digests.id WHERE "
  734. "digests.id IS NULL;";
  735. sqlite3_stmt *stmt;
  736. GArray *orphaned;
  737. struct orphaned_shingle_elt orphaned_elt, *pelt;
  738. if (backend == NULL) {
  739. return FALSE;
  740. }
  741. /* Perform expire */
  742. if (expire > 0) {
  743. expire_lim = time(NULL) - expire;
  744. if (expire_lim > 0) {
  745. ret = rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE,
  746. RSPAMD_FUZZY_BACKEND_TRANSACTION_START);
  747. if (ret == SQLITE_OK) {
  748. rc = rspamd_fuzzy_backend_sqlite_run_stmt(backend, FALSE,
  749. RSPAMD_FUZZY_BACKEND_EXPIRE, expire_lim, max_changes);
  750. if (rc == SQLITE_OK) {
  751. expired = sqlite3_changes(backend->db);
  752. if (expired > 0) {
  753. backend->expired += expired;
  754. msg_info_fuzzy_backend("expired %L hashes", expired);
  755. }
  756. }
  757. else {
  758. msg_warn_fuzzy_backend(
  759. "cannot execute expired statement: %s",
  760. sqlite3_errmsg(backend->db));
  761. }
  762. rspamd_fuzzy_backend_sqlite_cleanup_stmt(backend,
  763. RSPAMD_FUZZY_BACKEND_EXPIRE);
  764. ret = rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE,
  765. RSPAMD_FUZZY_BACKEND_TRANSACTION_COMMIT);
  766. if (ret != SQLITE_OK) {
  767. rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE,
  768. RSPAMD_FUZZY_BACKEND_TRANSACTION_ROLLBACK);
  769. }
  770. }
  771. if (ret != SQLITE_OK) {
  772. msg_warn_fuzzy_backend("cannot expire db: %s",
  773. sqlite3_errmsg(backend->db));
  774. }
  775. }
  776. }
  777. /* Cleanup database */
  778. if (clean_orphaned) {
  779. ret = rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE,
  780. RSPAMD_FUZZY_BACKEND_TRANSACTION_START);
  781. if (ret == SQLITE_OK) {
  782. if ((rc = sqlite3_prepare_v2(backend->db,
  783. orphaned_shingles,
  784. -1,
  785. &stmt,
  786. NULL)) != SQLITE_OK) {
  787. msg_warn_fuzzy_backend("cannot cleanup shingles: %s",
  788. sqlite3_errmsg(backend->db));
  789. }
  790. else {
  791. orphaned = g_array_new(FALSE,
  792. FALSE,
  793. sizeof(struct orphaned_shingle_elt));
  794. while (sqlite3_step(stmt) == SQLITE_ROW) {
  795. orphaned_elt.value = sqlite3_column_int64(stmt, 0);
  796. orphaned_elt.number = sqlite3_column_int64(stmt, 1);
  797. g_array_append_val(orphaned, orphaned_elt);
  798. if (orphaned->len > max_changes) {
  799. break;
  800. }
  801. }
  802. sqlite3_finalize(stmt);
  803. orphaned_cnt = orphaned->len;
  804. if (orphaned_cnt > 0) {
  805. msg_info_fuzzy_backend(
  806. "going to delete %ud orphaned shingles",
  807. orphaned_cnt);
  808. /* Need to delete orphaned elements */
  809. for (i = 0; i < (int) orphaned_cnt; i++) {
  810. pelt = &g_array_index(orphaned,
  811. struct orphaned_shingle_elt,
  812. i);
  813. rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE,
  814. RSPAMD_FUZZY_BACKEND_DELETE_ORPHANED,
  815. pelt->value, pelt->number);
  816. }
  817. }
  818. g_array_free(orphaned, TRUE);
  819. }
  820. ret = rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE,
  821. RSPAMD_FUZZY_BACKEND_TRANSACTION_COMMIT);
  822. if (ret == SQLITE_OK) {
  823. msg_info_fuzzy_backend(
  824. "deleted %ud orphaned shingles",
  825. orphaned_cnt);
  826. }
  827. else {
  828. msg_warn_fuzzy_backend(
  829. "cannot synchronize fuzzy backend: %e",
  830. err);
  831. rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE,
  832. RSPAMD_FUZZY_BACKEND_TRANSACTION_ROLLBACK);
  833. }
  834. }
  835. }
  836. return ret;
  837. }
  838. void rspamd_fuzzy_backend_sqlite_close(struct rspamd_fuzzy_backend_sqlite *backend)
  839. {
  840. if (backend != NULL) {
  841. if (backend->db != NULL) {
  842. rspamd_fuzzy_backend_sqlite_close_stmts(backend);
  843. sqlite3_close(backend->db);
  844. }
  845. if (backend->path != NULL) {
  846. g_free(backend->path);
  847. }
  848. if (backend->pool) {
  849. rspamd_mempool_delete(backend->pool);
  850. }
  851. g_free(backend);
  852. }
  853. }
  854. gsize rspamd_fuzzy_backend_sqlite_count(struct rspamd_fuzzy_backend_sqlite *backend)
  855. {
  856. if (backend) {
  857. if (rspamd_fuzzy_backend_sqlite_run_stmt(backend, FALSE,
  858. RSPAMD_FUZZY_BACKEND_COUNT) == SQLITE_OK) {
  859. backend->count = sqlite3_column_int64(
  860. prepared_stmts[RSPAMD_FUZZY_BACKEND_COUNT].stmt, 0);
  861. }
  862. rspamd_fuzzy_backend_sqlite_cleanup_stmt(backend, RSPAMD_FUZZY_BACKEND_COUNT);
  863. return backend->count;
  864. }
  865. return 0;
  866. }
  867. int rspamd_fuzzy_backend_sqlite_version(struct rspamd_fuzzy_backend_sqlite *backend,
  868. const char *source)
  869. {
  870. int ret = 0;
  871. if (backend) {
  872. if (rspamd_fuzzy_backend_sqlite_run_stmt(backend, FALSE,
  873. RSPAMD_FUZZY_BACKEND_VERSION, source) == SQLITE_OK) {
  874. ret = sqlite3_column_int64(
  875. prepared_stmts[RSPAMD_FUZZY_BACKEND_VERSION].stmt, 0);
  876. }
  877. rspamd_fuzzy_backend_sqlite_cleanup_stmt(backend, RSPAMD_FUZZY_BACKEND_VERSION);
  878. }
  879. return ret;
  880. }
  881. gsize rspamd_fuzzy_backend_sqlite_expired(struct rspamd_fuzzy_backend_sqlite *backend)
  882. {
  883. return backend != NULL ? backend->expired : 0;
  884. }
  885. const char *
  886. rspamd_fuzzy_sqlite_backend_id(struct rspamd_fuzzy_backend_sqlite *backend)
  887. {
  888. return backend != NULL ? backend->id : 0;
  889. }