You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

multipattern.c 14KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3
  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "config.h"
  17. #include "libutil/multipattern.h"
  18. #include "libutil/str_util.h"
  19. #include "libcryptobox/cryptobox.h"
  20. #ifdef WITH_HYPERSCAN
  21. #include "logger.h"
  22. #include "unix-std.h"
  23. #include "hs.h"
  24. #endif
  25. #include "acism.h"
  /* Number of hyperscan scratch slots kept in every multipattern object. */
  #define MAX_SCRATCH 4

  /* Outcome of the lazy hyperscan platform probe done by rspamd_hs_check (). */
  enum rspamd_hs_check_state {
      RSPAMD_HS_UNCHECKED = 0,  /* probe has not been performed yet */
      RSPAMD_HS_SUPPORTED = 1,  /* hs_valid_platform () reported HS_SUCCESS */
      RSPAMD_HS_UNSUPPORTED = 2 /* probe was performed and did not succeed */
  };

  /* Cache directory given to rspamd_multipattern_library_init (); while it is
   * NULL the cache load/save helpers return without touching the disk. */
  static const char *hs_cache_dir = NULL;

  /* Memoised probe result, updated by rspamd_hs_check (). */
  static enum rspamd_hs_check_state hs_suitable_cpu = RSPAMD_HS_UNCHECKED;
  34. struct rspamd_multipattern {
  35. #ifdef WITH_HYPERSCAN
  36. hs_database_t *db;
  37. hs_scratch_t *scratch[MAX_SCRATCH];
  38. GArray *hs_pats;
  39. GArray *hs_ids;
  40. GArray *hs_flags;
  41. rspamd_cryptobox_hash_state_t hash_state;
  42. guint scratch_used;
  43. #endif
  44. ac_trie_t *t;
  45. GArray *pats;
  46. gboolean compiled;
  47. guint cnt;
  48. enum rspamd_multipattern_flags flags;
  49. };
  50. static GQuark
  51. rspamd_multipattern_quark (void)
  52. {
  53. return g_quark_from_static_string ("multipattern");
  54. }
  55. static inline gboolean
  56. rspamd_hs_check (void)
  57. {
  58. #ifdef WITH_HYPERSCAN
  59. if (G_UNLIKELY (hs_suitable_cpu == RSPAMD_HS_UNCHECKED)) {
  60. if (hs_valid_platform () == HS_SUCCESS) {
  61. hs_suitable_cpu = RSPAMD_HS_SUPPORTED;
  62. }
  63. else {
  64. hs_suitable_cpu = RSPAMD_HS_UNSUPPORTED;
  65. }
  66. }
  67. #endif
  68. return hs_suitable_cpu == RSPAMD_HS_SUPPORTED;
  69. }
  70. void
  71. rspamd_multipattern_library_init (const gchar *cache_dir)
  72. {
  73. hs_cache_dir = cache_dir;
  74. #ifdef WITH_HYPERSCAN
  75. rspamd_hs_check ();
  76. #endif
  77. }
  78. #ifdef WITH_HYPERSCAN
  79. static gchar *
  80. rspamd_multipattern_escape_tld_hyperscan (const gchar *pattern, gsize slen,
  81. gsize *dst_len)
  82. {
  83. gsize len;
  84. const gchar *p, *prefix;
  85. gchar *res;
  86. /*
  87. * We understand the following cases
  88. * 1) blah -> .blah
  89. * 2) *.blah -> ..*\\.blah
  90. * 3) ???
  91. */
  92. if (pattern[0] == '*') {
  93. len = slen + 4;
  94. p = strchr (pattern, '.');
  95. if (p == NULL) {
  96. /* XXX: bad */
  97. p = pattern;
  98. }
  99. else {
  100. p ++;
  101. }
  102. prefix = ".*.";
  103. }
  104. else {
  105. len = slen + 1;
  106. prefix = ".";
  107. p = pattern;
  108. }
  109. res = g_malloc (len + 1);
  110. slen = rspamd_strlcpy (res, prefix, len + 1);
  111. slen += rspamd_strlcpy (res + slen, p, len + 1 - slen);
  112. *dst_len = slen;
  113. return res;
  114. }
  115. #endif
  116. static gchar *
  117. rspamd_multipattern_escape_tld_acism (const gchar *pattern, gsize len,
  118. gsize *dst_len)
  119. {
  120. gsize dlen, slen;
  121. const gchar *p, *prefix;
  122. gchar *res;
  123. /*
  124. * We understand the following cases
  125. * 1) blah -> \\.blah
  126. * 2) *.blah -> \\..*\\.blah
  127. * 3) ???
  128. */
  129. slen = len;
  130. if (pattern[0] == '*') {
  131. dlen = slen;
  132. p = memchr (pattern, '.', len);
  133. if (p == NULL) {
  134. /* XXX: bad */
  135. p = pattern;
  136. }
  137. else {
  138. p ++;
  139. }
  140. dlen -= p - pattern;
  141. prefix = ".";
  142. dlen ++;
  143. }
  144. else {
  145. dlen = slen + 1;
  146. prefix = ".";
  147. p = pattern;
  148. }
  149. res = g_malloc (dlen + 1);
  150. slen = strlen (prefix);
  151. memcpy (res, prefix, slen);
  152. rspamd_strlcpy (res + slen, p, dlen - slen + 1);
  153. *dst_len = dlen;
  154. return res;
  155. }
  156. /*
  157. * Escapes special characters from specific pattern
  158. */
  159. static gchar *
  160. rspamd_multipattern_pattern_filter (const gchar *pattern, gsize len,
  161. enum rspamd_multipattern_flags flags,
  162. gsize *dst_len)
  163. {
  164. gchar *ret = NULL;
  165. #ifdef WITH_HYPERSCAN
  166. if (rspamd_hs_check ()) {
  167. gint gl_flags = RSPAMD_REGEXP_ESCAPE_ASCII;
  168. if (flags & RSPAMD_MULTIPATTERN_UTF8) {
  169. gl_flags |= RSPAMD_REGEXP_ESCAPE_UTF;
  170. }
  171. if (flags & RSPAMD_MULTIPATTERN_TLD) {
  172. gchar *tmp;
  173. gsize tlen;
  174. tmp = rspamd_multipattern_escape_tld_hyperscan (pattern, len, &tlen);
  175. ret = rspamd_str_regexp_escape (tmp, tlen, dst_len,
  176. gl_flags|RSPAMD_REGEXP_ESCAPE_GLOB);
  177. g_free (tmp);
  178. }
  179. else if (flags & RSPAMD_MULTIPATTERN_RE) {
  180. ret = rspamd_str_regexp_escape (pattern, len, dst_len, gl_flags |
  181. RSPAMD_REGEXP_ESCAPE_RE);
  182. }
  183. else if (flags & RSPAMD_MULTIPATTERN_GLOB) {
  184. ret = rspamd_str_regexp_escape (pattern, len, dst_len,
  185. gl_flags | RSPAMD_REGEXP_ESCAPE_GLOB);
  186. }
  187. else {
  188. ret = rspamd_str_regexp_escape (pattern, len, dst_len, gl_flags);
  189. }
  190. return ret;
  191. }
  192. #endif
  193. if (flags & RSPAMD_MULTIPATTERN_TLD) {
  194. ret = rspamd_multipattern_escape_tld_acism (pattern, len, dst_len);
  195. }
  196. else {
  197. ret = malloc (len + 1);
  198. *dst_len = rspamd_strlcpy (ret, pattern, len + 1);
  199. }
  200. return ret;
  201. }
  202. struct rspamd_multipattern *
  203. rspamd_multipattern_create (enum rspamd_multipattern_flags flags)
  204. {
  205. struct rspamd_multipattern *mp;
  206. mp = g_malloc0 (sizeof (*mp));
  207. mp->flags = flags;
  208. #ifdef WITH_HYPERSCAN
  209. if (rspamd_hs_check ()) {
  210. mp->hs_pats = g_array_new (FALSE, TRUE, sizeof (gchar *));
  211. mp->hs_flags = g_array_new (FALSE, TRUE, sizeof (gint));
  212. mp->hs_ids = g_array_new (FALSE, TRUE, sizeof (gint));
  213. rspamd_cryptobox_hash_init (&mp->hash_state, NULL, 0);
  214. return mp;
  215. }
  216. #endif
  217. mp->pats = g_array_new (FALSE, TRUE, sizeof (ac_trie_pat_t));
  218. return mp;
  219. }
  220. struct rspamd_multipattern *
  221. rspamd_multipattern_create_sized (guint npatterns,
  222. enum rspamd_multipattern_flags flags)
  223. {
  224. struct rspamd_multipattern *mp;
  225. mp = g_malloc0 (sizeof (*mp));
  226. mp->flags = flags;
  227. #ifdef WITH_HYPERSCAN
  228. if (rspamd_hs_check ()) {
  229. mp->hs_pats = g_array_sized_new (FALSE, TRUE, sizeof (gchar *), npatterns);
  230. mp->hs_flags = g_array_sized_new (FALSE, TRUE, sizeof (gint), npatterns);
  231. mp->hs_ids = g_array_sized_new (FALSE, TRUE, sizeof (gint), npatterns);
  232. rspamd_cryptobox_hash_init (&mp->hash_state, NULL, 0);
  233. return mp;
  234. }
  235. #endif
  236. mp->pats = g_array_sized_new (FALSE, TRUE, sizeof (ac_trie_pat_t), npatterns);
  237. return mp;
  238. }
  239. void
  240. rspamd_multipattern_add_pattern (struct rspamd_multipattern *mp,
  241. const gchar *pattern, gint flags)
  242. {
  243. g_assert (pattern != NULL);
  244. rspamd_multipattern_add_pattern_len (mp, pattern, strlen (pattern), flags);
  245. }
  246. void
  247. rspamd_multipattern_add_pattern_len (struct rspamd_multipattern *mp,
  248. const gchar *pattern, gsize patlen, gint flags)
  249. {
  250. gsize dlen;
  251. g_assert (pattern != NULL);
  252. g_assert (mp != NULL);
  253. g_assert (!mp->compiled);
  254. #ifdef WITH_HYPERSCAN
  255. if (rspamd_hs_check ()) {
  256. gchar *np;
  257. gint fl = HS_FLAG_SOM_LEFTMOST;
  258. if (mp->flags & RSPAMD_MULTIPATTERN_ICASE) {
  259. fl |= HS_FLAG_CASELESS;
  260. }
  261. if (mp->flags & RSPAMD_MULTIPATTERN_UTF8) {
  262. fl |= HS_FLAG_UTF8|HS_FLAG_UCP;
  263. }
  264. g_array_append_val (mp->hs_flags, fl);
  265. np = rspamd_multipattern_pattern_filter (pattern, patlen, flags, &dlen);
  266. g_array_append_val (mp->hs_pats, np);
  267. fl = mp->cnt;
  268. g_array_append_val (mp->hs_ids, fl);
  269. rspamd_cryptobox_hash_update (&mp->hash_state, np, dlen);
  270. mp->cnt ++;
  271. return;
  272. }
  273. #endif
  274. ac_trie_pat_t pat;
  275. pat.ptr = rspamd_multipattern_pattern_filter (pattern, patlen, flags, &dlen);
  276. pat.len = dlen;
  277. g_array_append_val (mp->pats, pat);
  278. mp->cnt ++;
  279. }
  280. struct rspamd_multipattern *
  281. rspamd_multipattern_create_full (const gchar **patterns,
  282. guint npatterns, enum rspamd_multipattern_flags flags)
  283. {
  284. struct rspamd_multipattern *mp;
  285. guint i;
  286. g_assert (npatterns > 0);
  287. g_assert (patterns != NULL);
  288. mp = rspamd_multipattern_create_sized (npatterns, flags);
  289. for (i = 0; i < npatterns; i++) {
  290. rspamd_multipattern_add_pattern (mp, patterns[i], flags);
  291. }
  292. return mp;
  293. }
  294. #ifdef WITH_HYPERSCAN
  295. static gboolean
  296. rspamd_multipattern_try_load_hs (struct rspamd_multipattern *mp,
  297. const guchar *hash)
  298. {
  299. gchar fp[PATH_MAX];
  300. gpointer map;
  301. gsize len;
  302. if (hs_cache_dir == NULL) {
  303. return FALSE;
  304. }
  305. rspamd_snprintf (fp, sizeof (fp), "%s/%*xs.hsmp", hs_cache_dir,
  306. (gint)rspamd_cryptobox_HASHBYTES / 2, hash);
  307. if ((map = rspamd_file_xmap (fp, PROT_READ, &len, TRUE)) != NULL) {
  308. if (hs_deserialize_database (map, len, &mp->db) == HS_SUCCESS) {
  309. munmap (map, len);
  310. return TRUE;
  311. }
  312. munmap (map, len);
  313. /* Remove stale file */
  314. (void)unlink (fp);
  315. }
  316. return FALSE;
  317. }
  318. static void
  319. rspamd_multipattern_try_save_hs (struct rspamd_multipattern *mp,
  320. const guchar *hash)
  321. {
  322. gchar fp[PATH_MAX], np[PATH_MAX];
  323. char *bytes = NULL;
  324. gsize len;
  325. gint fd;
  326. if (hs_cache_dir == NULL) {
  327. return;
  328. }
  329. rspamd_snprintf (fp, sizeof (fp), "%s/%*xs.hsmp.tmp", hs_cache_dir,
  330. (gint)rspamd_cryptobox_HASHBYTES / 2, hash);
  331. if ((fd = rspamd_file_xopen (fp, O_WRONLY | O_CREAT | O_EXCL, 00644, 0)) != -1) {
  332. if (hs_serialize_database (mp->db, &bytes, &len) == HS_SUCCESS) {
  333. if (write (fd, bytes, len) == -1) {
  334. msg_warn ("cannot write hyperscan cache to %s: %s",
  335. fp, strerror (errno));
  336. unlink (fp);
  337. free (bytes);
  338. }
  339. else {
  340. free (bytes);
  341. fsync (fd);
  342. rspamd_snprintf (np, sizeof (np), "%s/%*xs.hsmp", hs_cache_dir,
  343. (gint)rspamd_cryptobox_HASHBYTES / 2, hash);
  344. if (rename (fp, np) == -1) {
  345. msg_warn ("cannot rename hyperscan cache from %s to %s: %s",
  346. fp, np, strerror (errno));
  347. unlink (fp);
  348. }
  349. }
  350. }
  351. else {
  352. msg_warn ("cannot serialize hyperscan cache to %s: %s",
  353. fp, strerror (errno));
  354. unlink (fp);
  355. }
  356. close (fd);
  357. }
  358. }
  359. #endif
  360. gboolean
  361. rspamd_multipattern_compile (struct rspamd_multipattern *mp, GError **err)
  362. {
  363. g_assert (mp != NULL);
  364. g_assert (!mp->compiled);
  365. #ifdef WITH_HYPERSCAN
  366. if (rspamd_hs_check ()) {
  367. guint i;
  368. hs_platform_info_t plt;
  369. hs_compile_error_t *hs_errors;
  370. guchar hash[rspamd_cryptobox_HASHBYTES];
  371. if (mp->cnt > 0) {
  372. g_assert (hs_populate_platform (&plt) == HS_SUCCESS);
  373. rspamd_cryptobox_hash_update (&mp->hash_state, (void *)&plt, sizeof (plt));
  374. rspamd_cryptobox_hash_final (&mp->hash_state, hash);
  375. if (!rspamd_multipattern_try_load_hs (mp, hash)) {
  376. if (hs_compile_multi ((const char *const *)mp->hs_pats->data,
  377. (const unsigned int *)mp->hs_flags->data,
  378. (const unsigned int *)mp->hs_ids->data,
  379. mp->cnt,
  380. HS_MODE_BLOCK,
  381. &plt,
  382. &mp->db,
  383. &hs_errors) != HS_SUCCESS) {
  384. g_set_error (err, rspamd_multipattern_quark (), EINVAL,
  385. "cannot create tree of regexp when processing '%s': %s",
  386. g_array_index (mp->hs_pats, char *, hs_errors->expression),
  387. hs_errors->message);
  388. hs_free_compile_error (hs_errors);
  389. return FALSE;
  390. }
  391. }
  392. rspamd_multipattern_try_save_hs (mp, hash);
  393. for (i = 0; i < MAX_SCRATCH; i ++) {
  394. g_assert (hs_alloc_scratch (mp->db, &mp->scratch[i]) == HS_SUCCESS);
  395. }
  396. }
  397. mp->compiled = TRUE;
  398. return TRUE;
  399. }
  400. #endif
  401. if (mp->cnt > 0) {
  402. mp->t = acism_create ((const ac_trie_pat_t *)mp->pats->data, mp->cnt);
  403. }
  404. mp->compiled = TRUE;
  405. return TRUE;
  406. }
  407. struct rspamd_multipattern_cbdata {
  408. struct rspamd_multipattern *mp;
  409. const gchar *in;
  410. gsize len;
  411. rspamd_multipattern_cb_t cb;
  412. gpointer ud;
  413. guint nfound;
  414. gint ret;
  415. };
  416. #ifdef WITH_HYPERSCAN
  417. static gint
  418. rspamd_multipattern_hs_cb (unsigned int id,
  419. unsigned long long from,
  420. unsigned long long to,
  421. unsigned int flags,
  422. void *ud)
  423. {
  424. struct rspamd_multipattern_cbdata *cbd = ud;
  425. gint ret = 0;
  426. if (to > 0) {
  427. if (from == HS_OFFSET_PAST_HORIZON) {
  428. from = 0;
  429. }
  430. ret = cbd->cb (cbd->mp, id, from, to, cbd->in, cbd->len, cbd->ud);
  431. cbd->nfound ++;
  432. cbd->ret = ret;
  433. }
  434. return ret;
  435. }
  436. #endif
  437. static gint
  438. rspamd_multipattern_acism_cb (int strnum, int textpos, void *context)
  439. {
  440. struct rspamd_multipattern_cbdata *cbd = context;
  441. gint ret;
  442. ac_trie_pat_t pat;
  443. pat = g_array_index (cbd->mp->pats, ac_trie_pat_t, strnum);
  444. ret = cbd->cb (cbd->mp, strnum, textpos - pat.len,
  445. textpos, cbd->in, cbd->len, cbd->ud);
  446. cbd->nfound ++;
  447. cbd->ret = ret;
  448. return ret;
  449. }
  450. gint
  451. rspamd_multipattern_lookup (struct rspamd_multipattern *mp,
  452. const gchar *in, gsize len, rspamd_multipattern_cb_t cb,
  453. gpointer ud, guint *pnfound)
  454. {
  455. struct rspamd_multipattern_cbdata cbd;
  456. gint ret = 0;
  457. g_assert (mp != NULL);
  458. if (mp->cnt == 0 || !mp->compiled) {
  459. return 0;
  460. }
  461. cbd.mp = mp;
  462. cbd.in = in;
  463. cbd.len = len;
  464. cbd.cb = cb;
  465. cbd.ud = ud;
  466. cbd.nfound = 0;
  467. cbd.ret = 0;
  468. #ifdef WITH_HYPERSCAN
  469. if (rspamd_hs_check ()) {
  470. hs_scratch_t *scr = NULL;
  471. guint i;
  472. for (i = 0; i < MAX_SCRATCH; i ++) {
  473. if (!(mp->scratch_used & (1 << i))) {
  474. mp->scratch_used |= (1 << i);
  475. scr = mp->scratch[i];
  476. break;
  477. }
  478. }
  479. g_assert (scr != NULL);
  480. ret = hs_scan (mp->db, in, len, 0, scr,
  481. rspamd_multipattern_hs_cb, &cbd);
  482. mp->scratch_used &= ~(1 << i);
  483. if (ret == HS_SUCCESS) {
  484. ret = 0;
  485. }
  486. else if (ret == HS_SCAN_TERMINATED) {
  487. ret = cbd.ret;
  488. }
  489. if (pnfound) {
  490. *pnfound = cbd.nfound;
  491. }
  492. return ret;
  493. }
  494. #endif
  495. gint state = 0;
  496. ret = acism_lookup (mp->t, in, len, rspamd_multipattern_acism_cb, &cbd,
  497. &state, mp->flags & RSPAMD_MULTIPATTERN_ICASE);
  498. if (pnfound) {
  499. *pnfound = cbd.nfound;
  500. }
  501. return ret;
  502. }
  503. void
  504. rspamd_multipattern_destroy (struct rspamd_multipattern *mp)
  505. {
  506. guint i;
  507. if (mp) {
  508. #ifdef WITH_HYPERSCAN
  509. if (rspamd_hs_check ()) {
  510. gchar *p;
  511. if (mp->compiled && mp->cnt > 0) {
  512. for (i = 0; i < MAX_SCRATCH; i ++) {
  513. hs_free_scratch (mp->scratch[i]);
  514. }
  515. hs_free_database (mp->db);
  516. }
  517. for (i = 0; i < mp->cnt; i ++) {
  518. p = g_array_index (mp->hs_pats, gchar *, i);
  519. g_free (p);
  520. }
  521. g_array_free (mp->hs_pats, TRUE);
  522. g_array_free (mp->hs_ids, TRUE);
  523. g_array_free (mp->hs_flags, TRUE);
  524. g_free (mp);
  525. return;
  526. }
  527. #endif
  528. ac_trie_pat_t pat;
  529. if (mp->compiled && mp->cnt > 0) {
  530. acism_destroy (mp->t);
  531. }
  532. for (i = 0; i < mp->cnt; i ++) {
  533. pat = g_array_index (mp->pats, ac_trie_pat_t, i);
  534. g_free ((gchar *)pat.ptr);
  535. }
  536. g_array_free (mp->pats, TRUE);
  537. g_free (mp);
  538. }
  539. }
  540. const gchar*
  541. rspamd_multipattern_get_pattern (struct rspamd_multipattern *mp,
  542. guint index)
  543. {
  544. g_assert (mp != NULL);
  545. g_assert (index < mp->cnt);
  546. #ifdef WITH_HYPERSCAN
  547. if (rspamd_hs_check ()) {
  548. return g_array_index (mp->hs_pats, gchar *, index);
  549. }
  550. #endif
  551. ac_trie_pat_t pat;
  552. pat = g_array_index (mp->pats, ac_trie_pat_t, index);
  553. return pat.ptr;
  554. }
  555. guint
  556. rspamd_multipattern_get_npatterns (struct rspamd_multipattern *mp)
  557. {
  558. g_assert (mp != NULL);
  559. return mp->cnt;
  560. }
  561. gboolean
  562. rspamd_multipattern_has_hyperscan (void)
  563. {
  564. return rspamd_hs_check ();
  565. }