You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

fuzzy_backend.c 24KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966
  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "config.h"
  17. #include "rspamd.h"
  18. #include "fuzzy_backend.h"
  19. #include "unix-std.h"
  20. #include <sqlite3.h>
  21. #include "libutil/sqlite_utils.h"
/*
 * State of one opened fuzzy hashes storage.
 */
struct rspamd_fuzzy_backend {
	sqlite3 *db; /* database handle, closed in rspamd_fuzzy_backend_close */
	char *path; /* g_strdup'ed copy of the database path */
	gchar id[MEMPOOL_UID_LEN]; /* textual id built from a hash of the path */
	gsize count; /* result of the last successful COUNT query */
	gsize expired; /* running total of rows removed by expiration */
	rspamd_mempool_t *pool; /* pool whose tag is used by the logging macros */
};
/*
 * Pause between retries of a busy/locked statement; converted to a timespec
 * with double_to_ts (presumably expressed in seconds - TODO confirm).
 */
static const gdouble sql_sleep_time = 0.1;
/* Number of retries allowed for a busy/locked statement before giving up */
static const guint max_retries = 10;
  32. #define msg_err_fuzzy_backend(...) rspamd_default_log_function (G_LOG_LEVEL_CRITICAL, \
  33. backend->pool->tag.tagname, backend->pool->tag.uid, \
  34. G_STRFUNC, \
  35. __VA_ARGS__)
  36. #define msg_warn_fuzzy_backend(...) rspamd_default_log_function (G_LOG_LEVEL_WARNING, \
  37. backend->pool->tag.tagname, backend->pool->tag.uid, \
  38. G_STRFUNC, \
  39. __VA_ARGS__)
  40. #define msg_info_fuzzy_backend(...) rspamd_default_log_function (G_LOG_LEVEL_INFO, \
  41. backend->pool->tag.tagname, backend->pool->tag.uid, \
  42. G_STRFUNC, \
  43. __VA_ARGS__)
  44. #define msg_debug_fuzzy_backend(...) rspamd_default_log_function (G_LOG_LEVEL_DEBUG, \
  45. backend->pool->tag.tagname, backend->pool->tag.uid, \
  46. G_STRFUNC, \
  47. __VA_ARGS__)
/*
 * Schema script handed to rspamd_sqlite3_open_or_create when the database is
 * opened. It defines two tables inside a single transaction:
 *  - digests: one row per stored hash (flag, digest, value, time);
 *  - shingles: (value, number) pairs referencing digests(id), declared with
 *    cascading delete/update;
 * followed by the indexes on both tables.
 */
static const char *create_tables_sql =
		"BEGIN;"
		"CREATE TABLE digests("
		"id INTEGER PRIMARY KEY,"
		"flag INTEGER NOT NULL,"
		"digest TEXT NOT NULL,"
		"value INTEGER,"
		"time INTEGER);"
		"CREATE TABLE shingles("
		"value INTEGER NOT NULL,"
		"number INTEGER NOT NULL,"
		"digest_id INTEGER REFERENCES digests(id) ON DELETE CASCADE "
		"ON UPDATE CASCADE);"
		"CREATE UNIQUE INDEX IF NOT EXISTS d ON digests(digest);"
		"CREATE INDEX IF NOT EXISTS t ON digests(time);"
		"CREATE INDEX IF NOT EXISTS dgst_id ON shingles(digest_id);"
		"CREATE UNIQUE INDEX IF NOT EXISTS s ON shingles(value, number);"
		"COMMIT;";
/*
 * Disabled: standalone index creation script. The same four index statements
 * are already part of create_tables_sql above.
 */
#if 0
static const char *create_index_sql =
		"BEGIN;"
		"CREATE UNIQUE INDEX IF NOT EXISTS d ON digests(digest);"
		"CREATE INDEX IF NOT EXISTS t ON digests(time);"
		"CREATE INDEX IF NOT EXISTS dgst_id ON shingles(digest_id);"
		"CREATE UNIQUE INDEX IF NOT EXISTS s ON shingles(value, number);"
		"COMMIT;";
#endif
/*
 * Indexes into the prepared_stmts table. The position of every table entry
 * must equal its .idx value: rspamd_fuzzy_backend_run_stmt asserts this
 * before executing a statement.
 */
enum rspamd_fuzzy_statement_idx {
	RSPAMD_FUZZY_BACKEND_TRANSACTION_START = 0,
	RSPAMD_FUZZY_BACKEND_TRANSACTION_COMMIT,
	RSPAMD_FUZZY_BACKEND_TRANSACTION_ROLLBACK,
	RSPAMD_FUZZY_BACKEND_INSERT,
	RSPAMD_FUZZY_BACKEND_UPDATE,
	RSPAMD_FUZZY_BACKEND_UPDATE_FLAG,
	RSPAMD_FUZZY_BACKEND_INSERT_SHINGLE,
	RSPAMD_FUZZY_BACKEND_CHECK,
	RSPAMD_FUZZY_BACKEND_CHECK_SHINGLE,
	RSPAMD_FUZZY_BACKEND_GET_DIGEST_BY_ID,
	RSPAMD_FUZZY_BACKEND_DELETE,
	RSPAMD_FUZZY_BACKEND_COUNT,
	RSPAMD_FUZZY_BACKEND_EXPIRE,
	RSPAMD_FUZZY_BACKEND_VACUUM,
	RSPAMD_FUZZY_BACKEND_DELETE_ORPHANED,
	/* Number of statements; used as the size of prepared_stmts */
	RSPAMD_FUZZY_BACKEND_MAX
};
/*
 * Table of all statements used by the backend, indexed by
 * enum rspamd_fuzzy_statement_idx.
 *
 *  - sql:    statement text compiled with sqlite3_prepare_v2;
 *  - args:   one letter per bound parameter, consumed in order by
 *            rspamd_fuzzy_backend_run_stmt:
 *              'I' - gint64 bound with sqlite3_bind_int64,
 *              'S' - gint bound with sqlite3_bind_int,
 *              'D' - digest pointer bound as 64 bytes of text,
 *              'T' - NUL-terminated text;
 *  - stmt:   compiled statement, NULL until prepared;
 *  - result: the sqlite3_step code that rspamd_fuzzy_backend_run_stmt
 *            reports as SQLITE_OK for this statement.
 *
 * The table is file-global, so compiled statements are shared by every
 * backend object handled by this file.
 */
static struct rspamd_fuzzy_stmts {
	enum rspamd_fuzzy_statement_idx idx;
	const gchar *sql;
	const gchar *args;
	sqlite3_stmt *stmt;
	gint result;
} prepared_stmts[RSPAMD_FUZZY_BACKEND_MAX] =
{
	{
		.idx = RSPAMD_FUZZY_BACKEND_TRANSACTION_START,
		.sql = "BEGIN TRANSACTION;",
		.args = "",
		.stmt = NULL,
		.result = SQLITE_DONE
	},
	{
		.idx = RSPAMD_FUZZY_BACKEND_TRANSACTION_COMMIT,
		.sql = "COMMIT;",
		.args = "",
		.stmt = NULL,
		.result = SQLITE_DONE
	},
	{
		.idx = RSPAMD_FUZZY_BACKEND_TRANSACTION_ROLLBACK,
		.sql = "ROLLBACK;",
		.args = "",
		.stmt = NULL,
		.result = SQLITE_DONE
	},
	{
		/* Store a new digest: flag, digest, value, time */
		.idx = RSPAMD_FUZZY_BACKEND_INSERT,
		.sql = "INSERT INTO digests(flag, digest, value, time) VALUES"
				"(?1, ?2, ?3, ?4);",
		.args = "SDII",
		.stmt = NULL,
		.result = SQLITE_DONE
	},
	{
		/* Add ?1 to the value of an existing digest */
		.idx = RSPAMD_FUZZY_BACKEND_UPDATE,
		.sql = "UPDATE digests SET value = value + ?1 WHERE "
				"digest==?2;",
		.args = "ID",
		.stmt = NULL,
		.result = SQLITE_DONE
	},
	{
		/* Overwrite both value and flag of an existing digest */
		.idx = RSPAMD_FUZZY_BACKEND_UPDATE_FLAG,
		.sql = "UPDATE digests SET value = ?1, flag = ?2 WHERE "
				"digest==?3;",
		.args = "IID",
		.stmt = NULL,
		.result = SQLITE_DONE
	},
	{
		.idx = RSPAMD_FUZZY_BACKEND_INSERT_SHINGLE,
		.sql = "INSERT OR REPLACE INTO shingles(value, number, digest_id) "
				"VALUES (?1, ?2, ?3);",
		.args = "III",
		.stmt = NULL,
		.result = SQLITE_DONE
	},
	{
		/* Exact lookup; result columns: 0 - value, 1 - time, 2 - flag */
		.idx = RSPAMD_FUZZY_BACKEND_CHECK,
		.sql = "SELECT value, time, flag FROM digests WHERE digest==?1;",
		.args = "D",
		.stmt = NULL,
		.result = SQLITE_ROW
	},
	{
		/* Shingle lookup; result column 0 is the owning digest id */
		.idx = RSPAMD_FUZZY_BACKEND_CHECK_SHINGLE,
		.sql = "SELECT digest_id FROM shingles WHERE value=?1 AND number=?2",
		.args = "IS",
		.stmt = NULL,
		.result = SQLITE_ROW
	},
	{
		/* Result columns: 0 - digest, 1 - value, 2 - time, 3 - flag */
		.idx = RSPAMD_FUZZY_BACKEND_GET_DIGEST_BY_ID,
		.sql = "SELECT digest, value, time, flag FROM digests WHERE id=?1",
		.args = "I",
		.stmt = NULL,
		.result = SQLITE_ROW
	},
	{
		.idx = RSPAMD_FUZZY_BACKEND_DELETE,
		.sql = "DELETE FROM digests WHERE digest==?1;",
		.args = "D",
		.stmt = NULL,
		.result = SQLITE_DONE
	},
	{
		.idx = RSPAMD_FUZZY_BACKEND_COUNT,
		.sql = "SELECT COUNT(*) FROM digests;",
		.args = "",
		.stmt = NULL,
		.result = SQLITE_ROW
	},
	{
		/* Remove at most ?2 digests whose time is below ?1 */
		.idx = RSPAMD_FUZZY_BACKEND_EXPIRE,
		.sql = "DELETE FROM digests WHERE id IN (SELECT id FROM digests WHERE time < ?1 LIMIT ?2);",
		.args = "II",
		.stmt = NULL,
		.result = SQLITE_DONE
	},
	{
		.idx = RSPAMD_FUZZY_BACKEND_VACUUM,
		.sql = "VACUUM;",
		.args = "",
		.stmt = NULL,
		.result = SQLITE_DONE
	},
	{
		.idx = RSPAMD_FUZZY_BACKEND_DELETE_ORPHANED,
		.sql = "DELETE FROM shingles WHERE value=?1 AND number=?2;",
		.args = "II",
		.stmt = NULL,
		.result = SQLITE_DONE
	},
};
  211. static GQuark
  212. rspamd_fuzzy_backend_quark(void)
  213. {
  214. return g_quark_from_static_string ("fuzzy-storage-backend");
  215. }
  216. static gboolean
  217. rspamd_fuzzy_backend_prepare_stmts (struct rspamd_fuzzy_backend *bk, GError **err)
  218. {
  219. int i;
  220. for (i = 0; i < RSPAMD_FUZZY_BACKEND_MAX; i ++) {
  221. if (prepared_stmts[i].stmt != NULL) {
  222. /* Skip already prepared statements */
  223. continue;
  224. }
  225. if (sqlite3_prepare_v2 (bk->db, prepared_stmts[i].sql, -1,
  226. &prepared_stmts[i].stmt, NULL) != SQLITE_OK) {
  227. g_set_error (err, rspamd_fuzzy_backend_quark (),
  228. -1, "Cannot initialize prepared sql `%s`: %s",
  229. prepared_stmts[i].sql, sqlite3_errmsg (bk->db));
  230. return FALSE;
  231. }
  232. }
  233. return TRUE;
  234. }
  235. static int
  236. rspamd_fuzzy_backend_cleanup_stmt (struct rspamd_fuzzy_backend *backend,
  237. int idx)
  238. {
  239. sqlite3_stmt *stmt;
  240. if (idx < 0 || idx >= RSPAMD_FUZZY_BACKEND_MAX) {
  241. return -1;
  242. }
  243. msg_debug_fuzzy_backend ("reseting `%s`", prepared_stmts[idx].sql);
  244. stmt = prepared_stmts[idx].stmt;
  245. sqlite3_clear_bindings (stmt);
  246. sqlite3_reset (stmt);
  247. return SQLITE_OK;
  248. }
  249. static int
  250. rspamd_fuzzy_backend_run_stmt (struct rspamd_fuzzy_backend *backend,
  251. gboolean auto_cleanup,
  252. int idx, ...)
  253. {
  254. int retcode;
  255. va_list ap;
  256. sqlite3_stmt *stmt;
  257. int i;
  258. const char *argtypes;
  259. guint retries = 0;
  260. struct timespec ts;
  261. if (idx < 0 || idx >= RSPAMD_FUZZY_BACKEND_MAX) {
  262. return -1;
  263. }
  264. stmt = prepared_stmts[idx].stmt;
  265. g_assert ((int)prepared_stmts[idx].idx == idx);
  266. if (stmt == NULL) {
  267. if ((retcode = sqlite3_prepare_v2 (backend->db, prepared_stmts[idx].sql, -1,
  268. &prepared_stmts[idx].stmt, NULL)) != SQLITE_OK) {
  269. msg_err_fuzzy_backend ("Cannot initialize prepared sql `%s`: %s",
  270. prepared_stmts[idx].sql, sqlite3_errmsg (backend->db));
  271. return retcode;
  272. }
  273. stmt = prepared_stmts[idx].stmt;
  274. }
  275. msg_debug_fuzzy_backend ("executing `%s` %s auto cleanup",
  276. prepared_stmts[idx].sql, auto_cleanup ? "with" : "without");
  277. argtypes = prepared_stmts[idx].args;
  278. sqlite3_clear_bindings (stmt);
  279. sqlite3_reset (stmt);
  280. va_start (ap, idx);
  281. for (i = 0; argtypes[i] != '\0'; i++) {
  282. switch (argtypes[i]) {
  283. case 'T':
  284. sqlite3_bind_text (stmt, i + 1, va_arg (ap, const char*), -1,
  285. SQLITE_STATIC);
  286. break;
  287. case 'I':
  288. sqlite3_bind_int64 (stmt, i + 1, va_arg (ap, gint64));
  289. break;
  290. case 'S':
  291. sqlite3_bind_int (stmt, i + 1, va_arg (ap, gint));
  292. break;
  293. case 'D':
  294. /* Special case for digests variable */
  295. sqlite3_bind_text (stmt, i + 1, va_arg (ap, const char*), 64,
  296. SQLITE_STATIC);
  297. break;
  298. }
  299. }
  300. va_end (ap);
  301. retry:
  302. retcode = sqlite3_step (stmt);
  303. if (retcode == prepared_stmts[idx].result) {
  304. retcode = SQLITE_OK;
  305. }
  306. else {
  307. if ((retcode == SQLITE_BUSY ||
  308. retcode == SQLITE_LOCKED) && retries++ < max_retries) {
  309. double_to_ts (sql_sleep_time, &ts);
  310. nanosleep (&ts, NULL);
  311. goto retry;
  312. }
  313. msg_debug_fuzzy_backend ("failed to execute query %s: %d, %s", prepared_stmts[idx].sql,
  314. retcode, sqlite3_errmsg (backend->db));
  315. }
  316. if (auto_cleanup) {
  317. sqlite3_clear_bindings (stmt);
  318. sqlite3_reset (stmt);
  319. }
  320. return retcode;
  321. }
  322. static void
  323. rspamd_fuzzy_backend_close_stmts (struct rspamd_fuzzy_backend *bk)
  324. {
  325. int i;
  326. for (i = 0; i < RSPAMD_FUZZY_BACKEND_MAX; i++) {
  327. if (prepared_stmts[i].stmt != NULL) {
  328. sqlite3_finalize (prepared_stmts[i].stmt);
  329. prepared_stmts[i].stmt = NULL;
  330. }
  331. }
  332. return;
  333. }
  334. static gboolean
  335. rspamd_fuzzy_backend_run_sql (const gchar *sql, struct rspamd_fuzzy_backend *bk,
  336. GError **err)
  337. {
  338. guint retries = 0;
  339. struct timespec ts;
  340. gint ret;
  341. do {
  342. ret = sqlite3_exec (bk->db, sql, NULL, NULL, NULL);
  343. double_to_ts (sql_sleep_time, &ts);
  344. } while (ret == SQLITE_BUSY && retries++ < max_retries &&
  345. nanosleep (&ts, NULL) == 0);
  346. if (ret != SQLITE_OK) {
  347. g_set_error (err, rspamd_fuzzy_backend_quark (),
  348. -1, "Cannot execute raw sql `%s`: %s",
  349. sql, sqlite3_errmsg (bk->db));
  350. return FALSE;
  351. }
  352. return TRUE;
  353. }
  354. static struct rspamd_fuzzy_backend *
  355. rspamd_fuzzy_backend_open_db (const gchar *path, GError **err)
  356. {
  357. struct rspamd_fuzzy_backend *bk;
  358. rspamd_cryptobox_hash_state_t st;
  359. guchar hash_out[rspamd_cryptobox_HASHBYTES];
  360. g_assert (path != NULL);
  361. bk = g_slice_alloc (sizeof (*bk));
  362. bk->path = g_strdup (path);
  363. bk->expired = 0;
  364. bk->pool = rspamd_mempool_new (rspamd_mempool_suggest_size (), "fuzzy_backend");
  365. bk->db = rspamd_sqlite3_open_or_create (bk->pool, bk->path,
  366. create_tables_sql, err);
  367. if (bk->db == NULL) {
  368. rspamd_fuzzy_backend_close (bk);
  369. return NULL;
  370. }
  371. if (!rspamd_fuzzy_backend_prepare_stmts (bk, err)) {
  372. rspamd_fuzzy_backend_close (bk);
  373. return NULL;
  374. }
  375. /* Set id for the backend */
  376. rspamd_cryptobox_hash_init (&st, NULL, 0);
  377. rspamd_cryptobox_hash_update (&st, path, strlen (path));
  378. rspamd_cryptobox_hash_final (&st, hash_out);
  379. rspamd_snprintf (bk->id, sizeof (bk->id), "%xs", hash_out);
  380. memcpy (bk->pool->tag.uid, bk->id, sizeof (bk->pool->tag.uid));
  381. return bk;
  382. }
  383. struct rspamd_fuzzy_backend *
  384. rspamd_fuzzy_backend_open (const gchar *path,
  385. gboolean vacuum,
  386. GError **err)
  387. {
  388. struct rspamd_fuzzy_backend *backend;
  389. if (path == NULL) {
  390. g_set_error (err, rspamd_fuzzy_backend_quark (),
  391. ENOENT, "Path has not been specified");
  392. return NULL;
  393. }
  394. /* Open database */
  395. if ((backend = rspamd_fuzzy_backend_open_db (path, err)) == NULL) {
  396. return NULL;
  397. }
  398. if (rspamd_fuzzy_backend_run_stmt (backend, FALSE, RSPAMD_FUZZY_BACKEND_COUNT)
  399. == SQLITE_OK) {
  400. backend->count = sqlite3_column_int64 (
  401. prepared_stmts[RSPAMD_FUZZY_BACKEND_COUNT].stmt, 0);
  402. }
  403. rspamd_fuzzy_backend_cleanup_stmt (backend, RSPAMD_FUZZY_BACKEND_COUNT);
  404. return backend;
  405. }
  406. static gint
  407. rspamd_fuzzy_backend_int64_cmp (const void *a, const void *b)
  408. {
  409. gint64 ia = *(gint64 *)a, ib = *(gint64 *)b;
  410. return (ia - ib);
  411. }
  412. struct rspamd_fuzzy_reply
  413. rspamd_fuzzy_backend_check (struct rspamd_fuzzy_backend *backend,
  414. const struct rspamd_fuzzy_cmd *cmd, gint64 expire)
  415. {
  416. struct rspamd_fuzzy_reply rep = {0, 0, 0, 0.0};
  417. const struct rspamd_fuzzy_shingle_cmd *shcmd;
  418. int rc;
  419. gint64 timestamp;
  420. gint64 shingle_values[RSPAMD_SHINGLE_SIZE], i, sel_id, cur_id,
  421. cur_cnt, max_cnt;
  422. if (backend == NULL) {
  423. return rep;
  424. }
  425. /* Try direct match first of all */
  426. rspamd_fuzzy_backend_run_stmt (backend, TRUE,
  427. RSPAMD_FUZZY_BACKEND_TRANSACTION_START);
  428. rc = rspamd_fuzzy_backend_run_stmt (backend, FALSE,
  429. RSPAMD_FUZZY_BACKEND_CHECK,
  430. cmd->digest);
  431. if (rc == SQLITE_OK) {
  432. timestamp = sqlite3_column_int64 (
  433. prepared_stmts[RSPAMD_FUZZY_BACKEND_CHECK].stmt, 1);
  434. if (time (NULL) - timestamp > expire) {
  435. /* Expire element */
  436. msg_debug_fuzzy_backend ("requested hash has been expired");
  437. }
  438. else {
  439. rep.value = sqlite3_column_int64 (
  440. prepared_stmts[RSPAMD_FUZZY_BACKEND_CHECK].stmt, 0);
  441. rep.prob = 1.0;
  442. rep.flag = sqlite3_column_int (
  443. prepared_stmts[RSPAMD_FUZZY_BACKEND_CHECK].stmt, 2);
  444. }
  445. }
  446. else if (cmd->shingles_count > 0) {
  447. /* Fuzzy match */
  448. rspamd_fuzzy_backend_cleanup_stmt (backend, RSPAMD_FUZZY_BACKEND_CHECK);
  449. shcmd = (const struct rspamd_fuzzy_shingle_cmd *)cmd;
  450. for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
  451. rc = rspamd_fuzzy_backend_run_stmt (backend, FALSE,
  452. RSPAMD_FUZZY_BACKEND_CHECK_SHINGLE,
  453. shcmd->sgl.hashes[i], i);
  454. if (rc == SQLITE_OK) {
  455. shingle_values[i] = sqlite3_column_int64 (
  456. prepared_stmts[RSPAMD_FUZZY_BACKEND_CHECK_SHINGLE].stmt,
  457. 0);
  458. }
  459. else {
  460. shingle_values[i] = -1;
  461. }
  462. msg_debug_fuzzy_backend ("looking for shingle %L -> %L: %d", i,
  463. shcmd->sgl.hashes[i], rc);
  464. }
  465. rspamd_fuzzy_backend_cleanup_stmt (backend,
  466. RSPAMD_FUZZY_BACKEND_CHECK_SHINGLE);
  467. qsort (shingle_values, RSPAMD_SHINGLE_SIZE, sizeof (gint64),
  468. rspamd_fuzzy_backend_int64_cmp);
  469. sel_id = -1;
  470. cur_id = -1;
  471. cur_cnt = 0;
  472. max_cnt = 0;
  473. for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
  474. if (shingle_values[i] == -1) {
  475. continue;
  476. }
  477. /* We have some value here, so we need to check it */
  478. if (shingle_values[i] == cur_id) {
  479. cur_cnt ++;
  480. }
  481. else {
  482. cur_id = shingle_values[i];
  483. if (cur_cnt >= max_cnt) {
  484. max_cnt = cur_cnt;
  485. sel_id = cur_id;
  486. }
  487. cur_cnt = 0;
  488. }
  489. }
  490. if (cur_cnt > max_cnt) {
  491. max_cnt = cur_cnt;
  492. }
  493. if (sel_id != -1) {
  494. /* We have some id selected here */
  495. rep.prob = (float)max_cnt / (float)RSPAMD_SHINGLE_SIZE;
  496. if (rep.prob > 0.5) {
  497. msg_debug_fuzzy_backend (
  498. "found fuzzy hash with probability %.2f",
  499. rep.prob);
  500. rc = rspamd_fuzzy_backend_run_stmt (backend, FALSE,
  501. RSPAMD_FUZZY_BACKEND_GET_DIGEST_BY_ID, sel_id);
  502. if (rc == SQLITE_OK) {
  503. timestamp = sqlite3_column_int64 (
  504. prepared_stmts[RSPAMD_FUZZY_BACKEND_GET_DIGEST_BY_ID].stmt,
  505. 2);
  506. if (time (NULL) - timestamp > expire) {
  507. /* Expire element */
  508. msg_debug_fuzzy_backend (
  509. "requested hash has been expired");
  510. }
  511. else {
  512. rep.value = sqlite3_column_int64 (
  513. prepared_stmts[RSPAMD_FUZZY_BACKEND_GET_DIGEST_BY_ID].stmt,
  514. 1);
  515. rep.flag = sqlite3_column_int (
  516. prepared_stmts[RSPAMD_FUZZY_BACKEND_GET_DIGEST_BY_ID].stmt,
  517. 3);
  518. }
  519. }
  520. }
  521. else {
  522. /* Otherwise we assume that as error */
  523. rep.value = 0;
  524. }
  525. rspamd_fuzzy_backend_cleanup_stmt (backend,
  526. RSPAMD_FUZZY_BACKEND_GET_DIGEST_BY_ID);
  527. }
  528. }
  529. rspamd_fuzzy_backend_cleanup_stmt (backend, RSPAMD_FUZZY_BACKEND_CHECK);
  530. rspamd_fuzzy_backend_run_stmt (backend, TRUE,
  531. RSPAMD_FUZZY_BACKEND_TRANSACTION_COMMIT);
  532. return rep;
  533. }
  534. gboolean
  535. rspamd_fuzzy_backend_prepare_update (struct rspamd_fuzzy_backend *backend)
  536. {
  537. gint rc;
  538. if (backend == NULL) {
  539. return FALSE;
  540. }
  541. rc = rspamd_fuzzy_backend_run_stmt (backend, TRUE,
  542. RSPAMD_FUZZY_BACKEND_TRANSACTION_START);
  543. if (rc != SQLITE_OK) {
  544. msg_warn_fuzzy_backend ("cannot start transaction for updates: %s",
  545. sqlite3_errmsg (backend->db));
  546. return FALSE;
  547. }
  548. return TRUE;
  549. }
  550. gboolean
  551. rspamd_fuzzy_backend_add (struct rspamd_fuzzy_backend *backend,
  552. const struct rspamd_fuzzy_cmd *cmd)
  553. {
  554. int rc, i;
  555. gint64 id, flag;
  556. const struct rspamd_fuzzy_shingle_cmd *shcmd;
  557. if (backend == NULL) {
  558. return FALSE;
  559. }
  560. rc = rspamd_fuzzy_backend_run_stmt (backend, FALSE,
  561. RSPAMD_FUZZY_BACKEND_CHECK,
  562. cmd->digest);
  563. if (rc == SQLITE_OK) {
  564. /* Check flag */
  565. flag = sqlite3_column_int64 (
  566. prepared_stmts[RSPAMD_FUZZY_BACKEND_CHECK].stmt,
  567. 2);
  568. rspamd_fuzzy_backend_cleanup_stmt (backend, RSPAMD_FUZZY_BACKEND_CHECK);
  569. if (flag == cmd->flag) {
  570. /* We need to increase weight */
  571. rc = rspamd_fuzzy_backend_run_stmt (backend, TRUE,
  572. RSPAMD_FUZZY_BACKEND_UPDATE,
  573. (gint64) cmd->value,
  574. cmd->digest);
  575. if (rc != SQLITE_OK) {
  576. msg_warn_fuzzy_backend ("cannot update hash to %d -> "
  577. "%*xs: %s", (gint) cmd->flag,
  578. (gint) sizeof (cmd->digest), cmd->digest,
  579. sqlite3_errmsg (backend->db));
  580. }
  581. }
  582. else {
  583. /* We need to relearn actually */
  584. rc = rspamd_fuzzy_backend_run_stmt (backend, TRUE,
  585. RSPAMD_FUZZY_BACKEND_UPDATE_FLAG,
  586. (gint64) cmd->value,
  587. (gint64) cmd->flag,
  588. cmd->digest);
  589. if (rc != SQLITE_OK) {
  590. msg_warn_fuzzy_backend ("cannot update hash to %d -> "
  591. "%*xs: %s", (gint) cmd->flag,
  592. (gint) sizeof (cmd->digest), cmd->digest,
  593. sqlite3_errmsg (backend->db));
  594. }
  595. }
  596. }
  597. else {
  598. rspamd_fuzzy_backend_cleanup_stmt (backend, RSPAMD_FUZZY_BACKEND_CHECK);
  599. rc = rspamd_fuzzy_backend_run_stmt (backend, FALSE,
  600. RSPAMD_FUZZY_BACKEND_INSERT,
  601. (gint) cmd->flag,
  602. cmd->digest,
  603. (gint64) cmd->value,
  604. (gint64) time (NULL));
  605. if (rc == SQLITE_OK) {
  606. if (cmd->shingles_count > 0) {
  607. id = sqlite3_last_insert_rowid (backend->db);
  608. shcmd = (const struct rspamd_fuzzy_shingle_cmd *) cmd;
  609. for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
  610. rc = rspamd_fuzzy_backend_run_stmt (backend, TRUE,
  611. RSPAMD_FUZZY_BACKEND_INSERT_SHINGLE,
  612. shcmd->sgl.hashes[i], (gint64)i, id);
  613. msg_debug_fuzzy_backend ("add shingle %d -> %L: %L",
  614. i,
  615. shcmd->sgl.hashes[i],
  616. id);
  617. if (rc != SQLITE_OK) {
  618. msg_warn_fuzzy_backend ("cannot add shingle %d -> "
  619. "%L: %L: %s", i,
  620. shcmd->sgl.hashes[i],
  621. id, sqlite3_errmsg (backend->db));
  622. }
  623. }
  624. }
  625. }
  626. else {
  627. msg_warn_fuzzy_backend ("cannot add hash to %d -> "
  628. "%*xs: %s", (gint)cmd->flag,
  629. (gint)sizeof (cmd->digest), cmd->digest,
  630. sqlite3_errmsg (backend->db));
  631. }
  632. rspamd_fuzzy_backend_cleanup_stmt (backend,
  633. RSPAMD_FUZZY_BACKEND_INSERT);
  634. }
  635. return (rc == SQLITE_OK);
  636. }
  637. gboolean
  638. rspamd_fuzzy_backend_finish_update (struct rspamd_fuzzy_backend *backend)
  639. {
  640. gint rc, wal_frames, wal_checkpointed;
  641. rc = rspamd_fuzzy_backend_run_stmt (backend, TRUE,
  642. RSPAMD_FUZZY_BACKEND_TRANSACTION_COMMIT);
  643. if (rc != SQLITE_OK) {
  644. msg_warn_fuzzy_backend ("cannot commit updates: %s",
  645. sqlite3_errmsg (backend->db));
  646. rspamd_fuzzy_backend_run_stmt (backend, TRUE,
  647. RSPAMD_FUZZY_BACKEND_TRANSACTION_ROLLBACK);
  648. return FALSE;
  649. }
  650. else {
  651. if (!rspamd_sqlite3_sync (backend->db, &wal_frames, &wal_checkpointed)) {
  652. msg_warn_fuzzy_backend ("cannot commit checkpoint: %s",
  653. sqlite3_errmsg (backend->db));
  654. }
  655. else if (wal_checkpointed > 0) {
  656. msg_info_fuzzy_backend ("total number of frames in the wal file: "
  657. "%d, checkpointed: %d", wal_frames, wal_checkpointed);
  658. }
  659. }
  660. return TRUE;
  661. }
  662. gboolean
  663. rspamd_fuzzy_backend_del (struct rspamd_fuzzy_backend *backend,
  664. const struct rspamd_fuzzy_cmd *cmd)
  665. {
  666. int rc;
  667. if (backend == NULL) {
  668. return FALSE;
  669. }
  670. rc = rspamd_fuzzy_backend_run_stmt (backend, TRUE,
  671. RSPAMD_FUZZY_BACKEND_DELETE,
  672. cmd->digest);
  673. return (rc == SQLITE_OK);
  674. }
  675. gboolean
  676. rspamd_fuzzy_backend_sync (struct rspamd_fuzzy_backend *backend,
  677. gint64 expire,
  678. gboolean clean_orphaned)
  679. {
  680. struct orphaned_shingle_elt {
  681. gint64 value;
  682. gint64 number;
  683. };
  684. /* Do not do more than 5k ops per step */
  685. const guint64 max_changes = 5000;
  686. gboolean ret = FALSE;
  687. gint64 expire_lim, expired;
  688. gint rc, i, orphaned_cnt = 0;
  689. GError *err = NULL;
  690. static const gchar orphaned_shingles[] = "SELECT shingles.value,shingles.number "
  691. "FROM shingles "
  692. "LEFT JOIN digests ON "
  693. "shingles.digest_id=digests.id WHERE "
  694. "digests.id IS NULL;";
  695. sqlite3_stmt *stmt;
  696. GArray *orphaned;
  697. struct orphaned_shingle_elt orphaned_elt, *pelt;
  698. if (backend == NULL) {
  699. return FALSE;
  700. }
  701. /* Perform expire */
  702. if (expire > 0) {
  703. expire_lim = time (NULL) - expire;
  704. if (expire_lim > 0) {
  705. ret = rspamd_fuzzy_backend_run_stmt (backend, TRUE,
  706. RSPAMD_FUZZY_BACKEND_TRANSACTION_START);
  707. if (ret == SQLITE_OK) {
  708. rc = rspamd_fuzzy_backend_run_stmt (backend, FALSE,
  709. RSPAMD_FUZZY_BACKEND_EXPIRE, expire_lim, max_changes);
  710. if (rc == SQLITE_OK) {
  711. expired = sqlite3_changes (backend->db);
  712. if (expired > 0) {
  713. backend->expired += expired;
  714. msg_info_fuzzy_backend ("expired %L hashes", expired);
  715. }
  716. }
  717. else {
  718. msg_warn_fuzzy_backend (
  719. "cannot execute expired statement: %s",
  720. sqlite3_errmsg (backend->db));
  721. }
  722. rspamd_fuzzy_backend_cleanup_stmt (backend,
  723. RSPAMD_FUZZY_BACKEND_EXPIRE);
  724. ret = rspamd_fuzzy_backend_run_stmt (backend, TRUE,
  725. RSPAMD_FUZZY_BACKEND_TRANSACTION_COMMIT);
  726. if (ret != SQLITE_OK) {
  727. rspamd_fuzzy_backend_run_stmt (backend, TRUE,
  728. RSPAMD_FUZZY_BACKEND_TRANSACTION_ROLLBACK);
  729. }
  730. }
  731. if (ret != SQLITE_OK) {
  732. msg_warn_fuzzy_backend ("cannot expire db: %s",
  733. sqlite3_errmsg (backend->db));
  734. }
  735. }
  736. }
  737. /* Cleanup database */
  738. if (clean_orphaned) {
  739. ret = rspamd_fuzzy_backend_run_stmt (backend, TRUE,
  740. RSPAMD_FUZZY_BACKEND_TRANSACTION_START);
  741. if (ret == SQLITE_OK) {
  742. if ((rc = sqlite3_prepare_v2 (backend->db,
  743. orphaned_shingles,
  744. -1,
  745. &stmt,
  746. NULL)) != SQLITE_OK) {
  747. msg_warn_fuzzy_backend ("cannot cleanup shingles: %s",
  748. sqlite3_errmsg (backend->db));
  749. }
  750. else {
  751. orphaned = g_array_new (FALSE,
  752. FALSE,
  753. sizeof (struct orphaned_shingle_elt));
  754. while (sqlite3_step (stmt) == SQLITE_ROW) {
  755. orphaned_elt.value = sqlite3_column_int64 (stmt, 0);
  756. orphaned_elt.number = sqlite3_column_int64 (stmt, 1);
  757. g_array_append_val (orphaned, orphaned_elt);
  758. if (orphaned->len > max_changes) {
  759. break;
  760. }
  761. }
  762. sqlite3_finalize (stmt);
  763. orphaned_cnt = orphaned->len;
  764. if (orphaned_cnt > 0) {
  765. msg_info_fuzzy_backend (
  766. "going to delete %ud orphaned shingles",
  767. orphaned_cnt);
  768. /* Need to delete orphaned elements */
  769. for (i = 0; i < (gint) orphaned_cnt; i++) {
  770. pelt = &g_array_index (orphaned,
  771. struct orphaned_shingle_elt,
  772. i);
  773. rspamd_fuzzy_backend_run_stmt (backend, TRUE,
  774. RSPAMD_FUZZY_BACKEND_DELETE_ORPHANED,
  775. pelt->value, pelt->number);
  776. }
  777. }
  778. g_array_free (orphaned, TRUE);
  779. }
  780. ret = rspamd_fuzzy_backend_run_stmt (backend, TRUE,
  781. RSPAMD_FUZZY_BACKEND_TRANSACTION_COMMIT);
  782. if (ret == SQLITE_OK) {
  783. msg_info_fuzzy_backend (
  784. "deleted %ud orphaned shingles",
  785. orphaned_cnt);
  786. }
  787. else {
  788. msg_warn_fuzzy_backend (
  789. "cannot synchronize fuzzy backend: %e",
  790. err);
  791. rspamd_fuzzy_backend_run_stmt (backend, TRUE,
  792. RSPAMD_FUZZY_BACKEND_TRANSACTION_ROLLBACK);
  793. }
  794. }
  795. }
  796. return ret;
  797. }
  798. void
  799. rspamd_fuzzy_backend_close (struct rspamd_fuzzy_backend *backend)
  800. {
  801. if (backend != NULL) {
  802. if (backend->db != NULL) {
  803. rspamd_fuzzy_backend_close_stmts (backend);
  804. sqlite3_close (backend->db);
  805. }
  806. if (backend->path != NULL) {
  807. g_free (backend->path);
  808. }
  809. if (backend->pool) {
  810. rspamd_mempool_delete (backend->pool);
  811. }
  812. g_slice_free1 (sizeof (*backend), backend);
  813. }
  814. }
  815. gsize
  816. rspamd_fuzzy_backend_count (struct rspamd_fuzzy_backend *backend)
  817. {
  818. if (backend) {
  819. if (rspamd_fuzzy_backend_run_stmt (backend, FALSE,
  820. RSPAMD_FUZZY_BACKEND_COUNT) == SQLITE_OK) {
  821. backend->count = sqlite3_column_int64 (
  822. prepared_stmts[RSPAMD_FUZZY_BACKEND_COUNT].stmt, 0);
  823. }
  824. rspamd_fuzzy_backend_cleanup_stmt (backend, RSPAMD_FUZZY_BACKEND_COUNT);
  825. return backend->count;
  826. }
  827. return 0;
  828. }
  829. gsize
  830. rspamd_fuzzy_backend_expired (struct rspamd_fuzzy_backend *backend)
  831. {
  832. return backend != NULL ? backend->expired : 0;
  833. }
  834. const gchar *
  835. rspamd_fuzzy_backend_id (struct rspamd_fuzzy_backend *backend)
  836. {
  837. return backend != NULL ? backend->id : 0;
  838. }